# Course: 5210 Data Visualization
# Purpose: Technical Appendix for Midterm Project
# Date: July 29, 2021
# Author: Jennifer Grosz and Josh Wolfe
# Clear environment of variables and functions
# NOTE(review): rm(list = ls()) wipes the interactive session's globals;
# restarting R is generally a safer way to guarantee a clean session
rm(list = ls(all = TRUE))
# Clear environment of packages: detach (and unload) every non-base package
# reported by sessionInfo() so repeated runs start from a known library state
if(is.null(sessionInfo()$otherPkgs) == FALSE)lapply(paste("package:", names(sessionInfo()$otherPkgs), sep=""), detach, character.only = TRUE, unload = TRUE)
# Load relevant packages used for analyzing General Mills data
library(tidyverse) # contains ggplot2, dplyr, and several other packages used
library(vtable) # contains vtable function for summary table of data
library(janitor) # contains tidyverse functions for cross-tables
library(gridExtra) # contains grid.arrange function used to combine plots in the same window
library(patchwork) # use to put graphs together in the same frame
library(knitr) # contains some table formatting functions
library(kableExtra) # also contains functions used for table outputs
library(GGally) # contains a custom correlation plot
library(moments) # to calculate skewness
library(scales) # use for rounding numbers
library(corrplot) # for correlation plot
library(tidylog) # provides data set information reports
library(tidyr) # provides uncount function
library(Hmisc)
library(MultinomialCI)
library(htmlTable)
library(colorspace)
options(scipen = 999) # remove scientific notation
# load product data (UPC-level attributes: brand, flavor, volume, package)
product_data <- read.csv("../data/mtp_product_data.csv")
# load sales data (store/week-level transactions: units, price, promo, ad)
sales_data <- read.csv("../data/mtp_sales_data.csv")
# Need to mutate UPC values to merge data sets because they're different in each data set. For example:
#
# - sales_data has UPC as 01.16000.11653
# - product_data has UPC as 00-01-16000-11653
# Rebuild product_data's UPC in the dotted format used by sales_data:
# "00-01-16000-11653" -> "01.16000.11653"
product_data <- product_data %>%
  # split the dash-delimited UPC into its four segments
  separate(UPC, into = c("first", "second", "third", "fourth"), sep = "-") %>%
  # glue the last three segments back together with dots
  unite(col = "UPC", second, third, fourth, sep = ".") %>%
  # discard the leading all-zero segment
  select(-first)
## select: dropped one variable (first)
# Join data sets: right_join keeps every product row; per the tidylog
# report below, all 21,850 sales rows matched a product (no unmatched rows)
gm_joined_data <- right_join(sales_data, product_data, by = "UPC")
## right_join: added 4 columns (brand, flavor, volume, package)
## > rows only in x ( 0)
## > rows only in y 0
## > matched rows 21,850 (includes duplicates)
## > ========
## > rows total 21,850
# Derive a clean `cereal` name and a normalized `brand` from the raw brand
# string, which is prefixed with the manufacturer (e.g. "GENERAL MILLS ...").
# Note: within a single mutate(), `cereal` is computed from the original
# `brand` before `brand` itself is overwritten.
gm_joined_data <- gm_joined_data %>%
  mutate(
    # strip the manufacturer prefix, leaving just the cereal name
    cereal = case_when(
      startsWith(brand, "GENERAL") ~ str_sub(brand, 15),
      startsWith(brand, "KELLOGGS") ~ str_sub(brand, 10),
      startsWith(brand, "POST") ~ str_sub(brand, 6)
    ),
    # collapse the raw string to one of the three manufacturer names
    brand = case_when(
      startsWith(brand, "GENERAL") ~ "GENERAL MILLS",
      startsWith(brand, "KELLOGGS") ~ "KELLOGGS",
      startsWith(brand, "POST") ~ "POST"
    )
  )
## mutate: changed 21,850 values (100%) of 'brand' (0 new NA)
## new variable 'cereal' (character) with 15 unique values and 0% NA
# Keep only the General Mills rows (brand was normalized in the chunk above)
gm_only <- subset(gm_joined_data, brand == "GENERAL MILLS")
# Convert the categorical columns to factors in both data frames.
# Listing the columns once and looping replaces fourteen near-identical
# assignments, so adding/removing a categorical column is a one-line change.
factor_cols <- c("promo", "ad", "brand", "cereal", "flavor", "package", "iri_key")
gm_joined_data[factor_cols] <- lapply(gm_joined_data[factor_cols], factor)
gm_only[factor_cols] <- lapply(gm_only[factor_cols], factor)
# Expand to one row per unit sold, so later summaries are weighted by units
gm_total_sales <- gm_joined_data %>% uncount(units)
## uncount: now 187,450 rows and 11 columns, ungrouped
gm_only <- gm_only %>% uncount(units)
## uncount: now 69,017 rows and 11 columns, ungrouped
# Look at the top few rows (first six) of the joined data
head(gm_joined_data)
## UPC iri_key week units price promo ad brand
## 1 01.16000.11653 644347 6 5 0.5 0 A GENERAL MILLS
## 2 01.16000.11653 248741 5 2 0.5 0 NONE GENERAL MILLS
## 3 01.16000.11653 535806 11 3 0.5 0 NONE GENERAL MILLS
## 4 01.16000.11945 675634 11 2 0.5 0 NONE GENERAL MILLS
## 5 01.16000.11945 205272 13 8 0.5 0 NONE GENERAL MILLS
## 6 01.16000.11945 248741 14 5 0.5 0 NONE GENERAL MILLS
## flavor volume package cereal
## 1 CINNAMON TOAST 0.06 BOX CINNAMON TST CR
## 2 CINNAMON TOAST 0.06 BOX CINNAMON TST CR
## 3 CINNAMON TOAST 0.06 BOX CINNAMON TST CR
## 4 TOASTED 0.04 BOX CHEERIOS
## 5 TOASTED 0.04 BOX CHEERIOS
## 6 TOASTED 0.04 BOX CHEERIOS
The data appears to be tidy: each row is one observation and each column is one variable.
# Get a breakdown of the variables: class and sample values for each column
str(gm_joined_data)
## 'data.frame': 21850 obs. of 12 variables:
## $ UPC : chr "01.16000.11653" "01.16000.11653" "01.16000.11653" "01.16000.11945" ...
## $ iri_key: Factor w/ 1420 levels "200171","200197",..: 1041 446 1018 1217 48 446 1295 794 1184 1043 ...
## $ week : int 6 5 11 11 13 14 39 35 45 5 ...
## $ units : int 5 2 3 2 8 5 6 1 4 14 ...
## $ price : num 0.5 0.5 0.5 0.5 0.5 0.5 1.09 1.59 1.59 1 ...
## $ promo : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ ad : Factor w/ 3 levels "A","B","NONE": 1 3 3 3 3 3 3 3 3 3 ...
## $ brand : Factor w/ 3 levels "GENERAL MILLS",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ flavor : Factor w/ 5 levels "CINNAMON TOAST",..: 1 1 1 5 5 5 1 1 1 1 ...
## $ volume : num 0.06 0.06 0.06 0.04 0.04 0.04 0.12 0.12 0.12 0.12 ...
## $ package: Factor w/ 2 levels "BOX","CUP": 1 1 1 1 1 1 2 2 2 2 ...
## $ cereal : Factor w/ 15 levels "CHEERIOS","CINNAMON TST CR",..: 2 2 2 1 1 1 2 2 2 2 ...
# Variable summary table (name, class, value range) via vtable::vtable
vtable(gm_joined_data)
| Name | Class | Values |
|---|---|---|
| UPC | character | |
| iri_key | factor | ‘200171’ ‘200197’ ‘200272’ ‘200297’ ‘200341’ and more |
| week | integer | Num: 1 to 52 |
| units | integer | Num: 1 to 28 |
| price | numeric | Num: 0.25 to 9.99 |
| promo | factor | ‘0’ ‘1’ |
| ad | factor | ‘A’ ‘B’ ‘NONE’ |
| brand | factor | ‘GENERAL MILLS’ ‘KELLOGGS’ ‘POST’ |
| flavor | factor | ‘CINNAMON TOAST’ ‘COCOA’ ‘FRUIT’ ‘REGULAR’ ‘TOASTED’ |
| volume | numeric | Num: 0.04 to 4 |
| package | factor | ‘BOX’ ‘CUP’ |
| cereal | factor | ‘CHEERIOS’ ‘CINNAMON TST CR’ ‘COCOA KRISPIES’ ‘COCOA PUFFS’ ‘FROOT LOOPS’ and more |
Two numeric variables (price, volume)
Two integer variables (week, units)
Seven factor variables
One character variable (UPC)
# Get variable descriptive statistics: five-number summaries for numeric
# columns and level counts for factors
summary(gm_joined_data)
## UPC iri_key week units
## Length:21850 656444 : 35 Min. : 1.00 Min. : 1.000
## Class :character 256951 : 31 1st Qu.:14.00 1st Qu.: 3.000
## Mode :character 259661 : 31 Median :27.00 Median : 7.000
## 267403 : 31 Mean :26.62 Mean : 8.579
## 652139 : 31 3rd Qu.:40.00 3rd Qu.:12.000
## 681735 : 31 Max. :52.00 Max. :28.000
## (Other):21660
## price promo ad brand
## Min. :0.250 0:17305 A : 1456 GENERAL MILLS: 7189
## 1st Qu.:3.190 1: 4545 B : 1061 KELLOGGS :12183
## Median :3.790 NONE:19333 POST : 2478
## Mean :3.763
## 3rd Qu.:4.390
## Max. :9.990
##
## flavor volume package cereal
## CINNAMON TOAST:1834 Min. :0.040 BOX:21306 FROSTED FLAKES : 2295
## COCOA :1901 1st Qu.:0.750 CUP: 544 FROOT LOOPS : 2192
## FRUIT :2192 Median :1.060 CINNAMON TST CR : 1834
## REGULAR :8816 Mean :1.016 LUCKY CHARMS : 1681
## TOASTED :7107 3rd Qu.:1.120 FROSTED MINI WHEATS: 1574
## Max. :4.000 CHEERIOS : 1458
## (Other) :10816
Comments and questions about the data
The categorical variables in this data set are iri_key (store key), week, brand, flavor, package, promo, and ad
To better understand these variables we will look at two elements:
absolute count
relative proportion or percent
# Observations and units sold per store, plus each store's share of all
# observations. `count` is already a per-group scalar, so wrapping it in
# sum() (as before) was redundant; the share is computed once and reused.
gm_joined_data %>%
  group_by(iri_key) %>%
  summarise(count = n(), # observations recorded at this store
            total_units_sold = sum(units), # units sold at this store
            proportion = count / nrow(gm_joined_data), # store's share of rows
            percent = proportion) %>% # same share, formatted below
  arrange(-percent) %>% # largest stores first
  mutate(percent = percent(percent)) # format as e.g. "0.1602%"
## # A tibble: 1,420 x 5
## iri_key count total_units_sold proportion percent
## <fct> <int> <int> <dbl> <chr>
## 1 656444 35 170 0.00160 0.1602%
## 2 256951 31 308 0.00142 0.1419%
## 3 259661 31 173 0.00142 0.1419%
## 4 267403 31 260 0.00142 0.1419%
## 5 652139 31 167 0.00142 0.1419%
## 6 681735 31 235 0.00142 0.1419%
## 7 1094689 31 249 0.00142 0.1419%
## 8 651600 30 210 0.00137 0.1373%
## 9 240692 29 210 0.00133 0.1327%
## 10 247300 29 268 0.00133 0.1327%
## # ... with 1,410 more rows
# there are 1420 separate stores in this data set
# NOTE(review): this chunk repeats the preceding iri_key summary verbatim
# and produces the identical table; one of the two chunks could be removed
gm_joined_data %>%
group_by(iri_key) %>%
summarise(count = n(), # make calculations to summarize variable and create counting function
total_units_sold = sum(units), # calculate total number of units sold
proportion = sum(count) / nrow(gm_joined_data), # Calculate proportion
percent = sum(count) / nrow(gm_joined_data)) %>%# create percent variable
arrange(-percent) %>% # display table results by percent in descending order
mutate(percent = percent(percent))
## group_by: one grouping variable (iri_key)
## summarise: now 1,420 rows and 5 columns, ungrouped
## mutate: converted 'percent' from double to character (0 new NA)
## # A tibble: 1,420 x 5
## iri_key count total_units_sold proportion percent
## <fct> <int> <int> <dbl> <chr>
## 1 656444 35 170 0.00160 0.1602%
## 2 256951 31 308 0.00142 0.1419%
## 3 259661 31 173 0.00142 0.1419%
## 4 267403 31 260 0.00142 0.1419%
## 5 652139 31 167 0.00142 0.1419%
## 6 681735 31 235 0.00142 0.1419%
## 7 1094689 31 249 0.00142 0.1419%
## 8 651600 30 210 0.00137 0.1373%
## 9 240692 29 210 0.00133 0.1327%
## 10 247300 29 268 0.00133 0.1327%
## # ... with 1,410 more rows
# there are 1420 separate stores in this data set
Comments:
- distribution of observations per store looks fairly even
- No single store makes up a significant proportion or percent of this data set, highest count is 35 followed by a group of stores with 31 observations
- There are a number of stores with less than 10 observations, so we might have too small of a sample size for those stores
# Observations per week with each week's share of the data set.
# `count` is a per-group scalar, so the former sum(count) was redundant;
# the share is computed once and reused for both columns.
week <- gm_joined_data %>%
  group_by(week) %>%
  summarise(count = n(), # observations recorded in this week
            proportion = count / nrow(gm_joined_data), # week's share of rows
            percent = proportion) %>% # same share, formatted below
  arrange(-percent) %>% # busiest weeks first
  mutate(percent = percent(percent)) # format as e.g. "2.1739%"
# there are 52 separate weeks in this data set
| week | count | proportion | percent |
|---|---|---|---|
| 22 | 475 | 0.0217391 | 2.1739% |
| 12 | 473 | 0.0216476 | 2.1648% |
| 37 | 464 | 0.0212357 | 2.1236% |
| 29 | 449 | 0.0205492 | 2.0549% |
| 20 | 445 | 0.0203661 | 2.0366% |
| 43 | 445 | 0.0203661 | 2.0366% |
| 48 | 445 | 0.0203661 | 2.0366% |
| 38 | 441 | 0.0201831 | 2.0183% |
| 40 | 441 | 0.0201831 | 2.0183% |
| 44 | 441 | 0.0201831 | 2.0183% |
| 30 | 439 | 0.0200915 | 2.0092% |
| 32 | 439 | 0.0200915 | 2.0092% |
| 45 | 439 | 0.0200915 | 2.0092% |
| 36 | 436 | 0.0199542 | 1.9954% |
| 7 | 435 | 0.0199085 | 1.9908% |
| 18 | 435 | 0.0199085 | 1.9908% |
| 27 | 432 | 0.0197712 | 1.9771% |
| 47 | 431 | 0.0197254 | 1.9725% |
| 14 | 429 | 0.0196339 | 1.9634% |
| 49 | 429 | 0.0196339 | 1.9634% |
| 6 | 428 | 0.0195881 | 1.9588% |
| 5 | 427 | 0.0195423 | 1.9542% |
| 24 | 425 | 0.0194508 | 1.9451% |
| 28 | 424 | 0.0194050 | 1.9405% |
| 25 | 423 | 0.0193593 | 1.9359% |
| 13 | 421 | 0.0192677 | 1.9268% |
| 19 | 421 | 0.0192677 | 1.9268% |
| 26 | 420 | 0.0192220 | 1.9222% |
| 46 | 420 | 0.0192220 | 1.9222% |
| 42 | 418 | 0.0191304 | 1.9130% |
| 17 | 414 | 0.0189474 | 1.8947% |
| 39 | 414 | 0.0189474 | 1.8947% |
| 11 | 412 | 0.0188558 | 1.8856% |
| 41 | 412 | 0.0188558 | 1.8856% |
| 4 | 409 | 0.0187185 | 1.8719% |
| 31 | 409 | 0.0187185 | 1.8719% |
| 16 | 403 | 0.0184439 | 1.8444% |
| 3 | 402 | 0.0183982 | 1.8398% |
| 2 | 401 | 0.0183524 | 1.8352% |
| 51 | 400 | 0.0183066 | 1.8307% |
| 23 | 399 | 0.0182609 | 1.8261% |
| 35 | 399 | 0.0182609 | 1.8261% |
| 9 | 398 | 0.0182151 | 1.8215% |
| 15 | 397 | 0.0181693 | 1.8169% |
| 34 | 394 | 0.0180320 | 1.8032% |
| 33 | 393 | 0.0179863 | 1.7986% |
| 10 | 389 | 0.0178032 | 1.7803% |
| 21 | 389 | 0.0178032 | 1.7803% |
| 52 | 388 | 0.0177574 | 1.7757% |
| 8 | 386 | 0.0176659 | 1.7666% |
| 1 | 383 | 0.0175286 | 1.7529% |
| 50 | 369 | 0.0168879 | 1.6888% |
Comments:
- distribution of observations per week looks fairly even
- No single week makes up a significant proportion or percent of this data set, highest count is 475 and the lowest count is 369
- might want to convert weeks into month/date values so we could look for seasonal or quarterly trends
# Observation counts by promo flag; proportion rounded to 2 places for
# display, percent formatted with scales::percent(). `count` is a per-group
# scalar, so the former sum(count) wrapper was redundant.
promo <- gm_joined_data %>%
  group_by(promo) %>%
  summarise(count = n(), # observations with this promo flag
            proportion = round(count / nrow(gm_joined_data), 2), # rounded share
            percent = count / nrow(gm_joined_data)) %>% # raw share, formatted below
  arrange(-percent) %>% # most common flag first
  mutate(percent = percent(percent)) # format as e.g. "79%"
| promo | count | proportion | percent |
|---|---|---|---|
| 0 | 17305 | 0.79 | 79% |
| 1 | 4545 | 0.21 | 21% |
Comments:
- significantly more observations in this data set were recorded when there was not an in store promotion than when there was a promotion
- 79% of the observations are no in-store promotions sales
- 21% of the observations are in-store promotions sales
# Observation counts by ad type; proportion rounded to 2 places for
# display, percent formatted with scales::percent(). `count` is a per-group
# scalar, so the former sum(count) wrapper was redundant.
ad <- gm_joined_data %>%
  group_by(ad) %>%
  summarise(count = n(), # observations with this ad type
            proportion = round(count / nrow(gm_joined_data), 2), # rounded share
            percent = count / nrow(gm_joined_data)) %>% # raw share, formatted below
  arrange(-percent) %>% # most common ad type first
  mutate(percent = percent(percent)) # format as e.g. "88.5%"
| ad | count | proportion | percent |
|---|---|---|---|
| NONE | 19333 | 0.88 | 88.5% |
| A | 1456 | 0.07 | 6.7% |
| B | 1061 | 0.05 | 4.9% |
Comments:
- significantly more observations in this data set were recorded when there was no advertisement
- 88.5% of the observations are no advertisement sales
- 6.7% of the observations are advertisement A - "medium ad" sales
- 4.9% of the observations are advertisement B - "small ad" sales
# Observation counts by brand; proportion rounded to 2 places for display,
# percent formatted with scales::percent(). `count` is a per-group scalar,
# so the former sum(count) wrapper was redundant.
brand <- gm_joined_data %>%
  group_by(brand) %>%
  summarise(count = n(), # observations for this brand
            proportion = round(count / nrow(gm_joined_data), 2), # rounded share
            percent = count / nrow(gm_joined_data)) %>% # raw share, formatted below
  arrange(-percent) %>% # biggest brand first
  mutate(percent = percent(percent)) # format as e.g. "56%"
| brand | count | proportion | percent |
|---|---|---|---|
| KELLOGGS | 12183 | 0.56 | 56% |
| GENERAL MILLS | 7189 | 0.33 | 33% |
| POST | 2478 | 0.11 | 11% |
Comments:
- Confirms Kelloggs had the most sales and Post had the least sales
- 56% of the observations are Kelloggs brand sales
- 33% of the observations are General Mills brand sales
- 11% of the observations are Post brand sales
# Observation counts by flavor; proportion rounded to 2 places for display,
# percent formatted with scales::percent(). `count` is a per-group scalar,
# so the former sum(count) wrapper was redundant.
flavor <- gm_joined_data %>%
  group_by(flavor) %>%
  summarise(count = n(), # observations for this flavor
            proportion = round(count / nrow(gm_joined_data), 2), # rounded share
            percent = count / nrow(gm_joined_data)) %>% # raw share, formatted below
  arrange(-percent) %>% # most common flavor first
  mutate(percent = percent(percent)) # format as e.g. "40.35%"
| flavor | count | proportion | percent |
|---|---|---|---|
| REGULAR | 8816 | 0.40 | 40.35% |
| TOASTED | 7107 | 0.33 | 32.53% |
| FRUIT | 2192 | 0.10 | 10.03% |
| COCOA | 1901 | 0.09 | 8.70% |
| CINNAMON TOAST | 1834 | 0.08 | 8.39% |
Comments:
- Confirms Regular flavor is the highest seller closely followed by Toasted
- 40.35% of the observations are Regular flavored sales
- 32.53% of the observations are Toasted flavored sales
- 10.03% of the observations are Fruit flavored sales
- 8.70% of the observations are Cocoa flavored sales
- 8.39% of the observations are Cinnamon Toast flavored sales
# Observation counts by package type; proportion rounded to 2 places for
# display, percent formatted with scales::percent(). `count` is a per-group
# scalar, so the former sum(count) wrapper was redundant.
package <- gm_joined_data %>%
  group_by(package) %>%
  summarise(count = n(), # observations for this package type
            proportion = round(count / nrow(gm_joined_data), 2), # rounded share
            percent = count / nrow(gm_joined_data)) %>% # raw share, formatted below
  arrange(-percent) %>% # most common package first
  mutate(percent = percent(percent)) # format as e.g. "98%"
| package | count | proportion | percent |
|---|---|---|---|
| BOX | 21306 | 0.98 | 98% |
| CUP | 544 | 0.02 | 2% |
Comments:
- Confirms significantly more observations in this data set were packaged in a box
- 98% of the observations are packaged in a box
- 2% of the observations are packaged in a cup
# Observation counts by cereal; proportion rounded to 2 places for display,
# percent formatted with scales::percent(). `count` is a per-group scalar,
# so the former sum(count) wrapper was redundant.
cereal <- gm_joined_data %>%
  group_by(cereal) %>%
  summarise(count = n(), # observations for this cereal
            proportion = round(count / nrow(gm_joined_data), 2), # rounded share
            percent = count / nrow(gm_joined_data)) %>% # raw share, formatted below
  arrange(-percent) %>% # best-selling cereal first
  mutate(percent = percent(percent)) # format as e.g. "10.503%"
| cereal | count | proportion | percent |
|---|---|---|---|
| FROSTED FLAKES | 2295 | 0.11 | 10.503% |
| FROOT LOOPS | 2192 | 0.10 | 10.032% |
| CINNAMON TST CR | 1834 | 0.08 | 8.394% |
| LUCKY CHARMS | 1681 | 0.08 | 7.693% |
| FROSTED MINI WHEATS | 1574 | 0.07 | 7.204% |
| CHEERIOS | 1458 | 0.07 | 6.673% |
| RICE KRISPIES | 1450 | 0.07 | 6.636% |
| SPECIAL K | 1391 | 0.06 | 6.366% |
| GRAPE NUTS | 1289 | 0.06 | 5.899% |
| RAISIN BRAN | 1266 | 0.06 | 5.794% |
| KIX | 1196 | 0.05 | 5.474% |
| SHREDDED WHEAT | 1189 | 0.05 | 5.442% |
| SMART START | 1134 | 0.05 | 5.190% |
| COCOA PUFFS | 1020 | 0.05 | 4.668% |
| COCOA KRISPIES | 881 | 0.04 | 4.032% |
Comments:
- distribution seems fairly balanced over cereal types
- there may be a low number of Cocoa Krispies when split up
The Quantitative variables in this data set are units, price, and volume
To better understand the Quantitative variables we will look at the following elements:
central tendency
variation
skewness
# Central tendency, spread, and shape (moments::skewness) of units sold
units <- gm_joined_data %>%
summarise(mean = mean(units), # calculate mean
median = median(units), # calculate median
max = max(units), # calculate max
min = min(units), # calculate min
standard_deviation = sd(units), # calculate sd
skew = skewness(units)) # calculate skew
| mean | median | max | min | standard_deviation | skew |
|---|---|---|---|---|---|
| 8.578947 | 7 | 28 | 1 | 6.70199 | 0.9981163 |
Comments:
skewness of units is 1
mean (8.5789474) is greater than median (7)
median will be a better measure of central tendency.
range (max - min) of units = 28 - 1 = 27
standard deviation of units = 6.7
# Central tendency, spread, and shape (moments::skewness) of price
price <- gm_joined_data %>%
summarise(mean = mean(price), # calculate mean
median = median(price), # calculate median
max = max(price), # calculate max
min = min(price), # calculate min
standard_deviation = sd(price), # calculate sd
skew = skewness(price)) # calculate skew
| mean | median | max | min | standard_deviation | skew |
|---|---|---|---|---|---|
| 3.763466 | 3.79 | 9.99 | 0.25 | 0.9971157 | -0.22296 |
Comments:
skewness of price is -0.22, small skew
mean ($3.76) is less than median ($3.79)
range (max - min) of price = $9.99 - $0.25 = $9.74
standard deviation of price = $1.00
# Central tendency, spread, and shape (moments::skewness) of volume
volume <- gm_joined_data %>%
summarise(mean = mean(volume), # calculate mean
median = median(volume), # calculate median
max = max(volume), # calculate max
min = min(volume), # calculate min
standard_deviation = sd(volume), # calculate sd
skew = skewness(volume)) # calculate skew
| mean | median | max | min | standard_deviation | skew |
|---|---|---|---|---|---|
| 1.01561 | 1.06 | 4 | 0.04 | 0.3703431 | 0.540629 |
Comments:
- skewness of volume is 0.54, small skew
- mean (1.02) is less than median (1.06)
- median will be a better measure of central tendency.
- range (max - min) of volume = 4 - 0.04 = 3.96
- standard deviation of volume = 0.37
Store (iri_key) - distribution of observations per store looks fairly even
Week - distribution of observations appears to be fairly even across all weeks
Promo - significantly more observations were recorded when there was not an in store promotion than when there was a promotion
Ad - significantly more observations were recorded when there was no advertisement
Brand - Kelloggs had the most sales and Post had the least sales
Flavor - Regular flavor is the highest seller closely followed by Toasted
Cereal - distribution seems to be fairly balanced over cereal type, but there may be a low number of Cocoa Krispies when split up
Package - significantly more observations in this data set were packaged in a box
Units - positive skew
Price - slight negative skew
Volume - slight positive skew
# Bar chart: number of observations recorded at each store
gm_joined_data %>%
  ggplot(aes(x = iri_key)) +
  geom_bar()
Comments:
- this is not a very clear visual, might want to identify specific stores for further analysis
# Bar chart: observations per week (week coerced to a discrete factor)
gm_joined_data %>%
  ggplot(aes(x = factor(week))) +
  geom_bar()
Comments:
- confirms even distribution of observations over each week
# Bar chart: observations with vs without an in-store promotion
gm_joined_data %>%
  ggplot(aes(x = factor(promo))) +
  geom_bar()
Comments:
- confirms more sales were made without an in store promotion
- is this representative of the typical distribution?
# Bar chart: observations by advertisement type (A, B, NONE)
gm_joined_data %>%
  ggplot(aes(x = factor(ad))) +
  geom_bar()
Comments:
- confirms the most sales were made without an advertisement
- is this representative of the typical distribution?
# Bar chart: observations by brand
gm_joined_data %>%
  ggplot(aes(x = factor(brand))) +
  geom_bar()
Comments:
- confirms Kelloggs had the most sales and Post had the least sales
# Bar chart: observations by flavor
gm_joined_data %>%
  ggplot(aes(x = factor(flavor))) +
  geom_bar()
Comments:
- confirms Regular and Toasted had the most sales
- Cinnamon Toast had the least sales
# Bar chart: observations by package type (box vs cup)
gm_joined_data %>%
  ggplot(aes(x = factor(package))) +
  geom_bar()
Comments:
- confirms box Packaging had the most sales
- is this representative of a typical distribution?
# Bar chart: observations by cereal
gm_joined_data %>%
  ggplot(aes(x = factor(cereal))) +
  geom_bar()
Comments:
- there appears to be a good number of observations for each cereal category
- Uni-variate graphical EDA for quantitative variables will be histogram plots, box plots and density plots
# Units: three views of the distribution ---------------------------------
# Histogram with 10 bins
units_histogram <- gm_joined_data %>%
  ggplot(aes(x = units)) +
  geom_histogram(bins = 10)
# Single box plot (constant x = 1 draws one box)
units_boxplot <- gm_joined_data %>%
  ggplot(aes(x = 1)) +
  geom_boxplot(aes(y = units))
# Kernel density drawn as a red line
units_density <- gm_joined_data %>%
  ggplot(aes(x = units)) +
  geom_line(stat = "density", color = "red")
# Combine with patchwork: box plot beside stacked histogram/density
units_boxplot + (units_histogram / units_density)
- Confirms skewed distribution
- will use median rather than mean
- observations with more than 30 units appear to be outliers, but they should be kept in the data set
# Price: three views of the distribution ---------------------------------
# Histogram with 10 bins
price_histogram <- gm_joined_data %>%
  ggplot(aes(x = price)) +
  geom_histogram(bins = 10)
# Single box plot (constant x = 1 draws one box)
price_boxplot <- gm_joined_data %>%
  ggplot(aes(x = 1)) +
  geom_boxplot(aes(y = price))
# Kernel density drawn as a red line
price_density <- gm_joined_data %>%
  ggplot(aes(x = price)) +
  geom_line(stat = "density", color = "red")
# Combine with patchwork: box plot beside stacked histogram/density
price_boxplot + (price_histogram / price_density)
- Distribution appears to be normal
- will use mean
- price greater than 7.5 might appear to be outliers, but they are important and shouldn't be removed
# Volume: three views of the distribution --------------------------------
# Histogram with 10 bins
volume_histogram <- gm_joined_data %>%
  ggplot(aes(x = volume)) +
  geom_histogram(bins = 10)
# Single box plot (constant x = 1 draws one box)
volume_boxplot <- gm_joined_data %>%
  ggplot(aes(x = 1)) +
  geom_boxplot(aes(y = volume))
# Kernel density drawn as a red line
volume_density <- gm_joined_data %>%
  ggplot(aes(x = volume)) +
  geom_line(stat = "density", color = "red")
# Combine with patchwork: box plot beside stacked histogram/density
volume_boxplot + (volume_histogram / volume_density)
- Confirms slightly skewed distribution
- will use median rather than mean
- volume of 2 or above appears to be outliers, but they are important and shouldn't be removed
Store (iri_key) - not a clear visual, hard to see what’s going on
Week - confirms fairly even number of observations across all weeks
Promo - is this distribution representative of the expected number of sales made when there's a promo compared to the number of sales when there isn't a promo?
Ad - is this distribution representative of the expected number of sales made when there's an advertisement compared to the number of sales when there isn't an advertisement?
Flavor - Cinnamon Toast had the least sales
Package - is this distribution representative of the expected number of sales made for box packaging compared to cup packaging?
Cereal - there appears to be a good number of observations for each cereal category. I don't think there will be an issue with the slightly lower number of Cocoa Krispies observations when split up
Units - use median rather than mean
Price - use mean, distribution appears normal
Volume - use median rather than mean
# Cross-tab of observation counts: stores (rows) by week (columns), with
# row and column totals appended; head() shows the first six stores
gm_joined_data %>%
tabyl(iri_key, week) %>%
adorn_totals(where = c("row", "col")) %>%
head()
## iri_key 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 28 29 3 30
## 200171 0 1 1 0 0 0 0 2 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0
## 200197 1 0 1 0 1 0 0 2 1 0 0 1 1 0 0 1 0 1 1 3 0 1 0 0
## 200272 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 1 0
## 200297 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 2 0 0 0 0 0
## 200341 0 0 0 1 0 1 0 1 0 0 0 1 2 1 0 1 0 1 0 0 1 0 0 0
## 200379 0 0 0 1 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0
## 31 32 33 34 35 36 37 38 39 4 40 41 42 43 44 45 46 47 48 49 5 50 51 52 6 7 8 9
## 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 2 0 0 0 1 0 0 0 0 0 0
## 1 0 0 0 0 1 1 0 0 0 1 0 0 0 2 0 0 1 0 0 2 1 1 0 1 0 0 0
## 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 2 0 2 0 1 1
## 0 1 0 1 0 0 1 0 1 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0
## 1 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 2 0 0 0 1 0 0 0 0 0 0 0
## 0 0 1 1 0 1 0 1 1 1 0 0 2 1 0 1 0 0 0 0 0 0 0 0 2 0 1 0
## Total
## 15
## 27
## 17
## 14
## 18
## 19
Comments:
- sample size for each group is too small, which means we are not going to be able to come up with any reliable findings
# Cross-tab of observation counts: stores (rows) by promo flag (columns),
# with totals; head() shows the first six stores
gm_joined_data %>%
tabyl(iri_key, promo) %>%
adorn_totals(where = c("row", "col")) %>%
head()
## iri_key 0 1 Total
## 200171 10 5 15
## 200197 18 9 27
## 200272 9 8 17
## 200297 7 7 14
## 200341 9 9 18
## 200379 17 2 19
Comments:
- sample size for each group looks small, which means we may not be able to come up with any reliable findings about promotions per store
# Cross-tab of observation counts: stores (rows) by ad type (columns),
# with totals; head() shows the first six stores
gm_joined_data %>%
tabyl(iri_key, ad) %>%
adorn_totals(where = c("row", "col")) %>%
head()
## iri_key A B NONE Total
## 200171 4 2 9 15
## 200197 4 0 23 27
## 200272 0 0 17 17
## 200297 1 0 13 14
## 200341 0 1 17 18
## 200379 1 0 18 19
Comments:
- largest sample size for each store when there is no advertisement, we may not be able to come up with any reliable findings about advertisements per store
# Cross-tab of observation counts: stores (rows) by brand (columns),
# with totals; head() shows the first six stores
gm_joined_data %>%
tabyl(iri_key, brand) %>%
adorn_totals(where = c("row", "col")) %>%
head()
## iri_key GENERAL MILLS KELLOGGS POST Total
## 200171 4 10 1 15
## 200197 10 15 2 27
## 200272 3 10 4 17
## 200297 5 6 3 14
## 200341 6 10 2 18
## 200379 10 7 2 19
Comments:
- sample size for each group looks small, which means we may not be able to come up with any reliable findings about brand sales per store
# Cross-tab of observation counts: stores (rows) by flavor (columns),
# with totals; head() shows the first six stores
gm_joined_data %>%
tabyl(iri_key, flavor) %>%
adorn_totals(where = c("row", "col")) %>%
head()
## iri_key CINNAMON TOAST COCOA FRUIT REGULAR TOASTED Total
## 200171 2 3 1 3 6 15
## 200197 1 0 4 12 10 27
## 200272 0 0 3 8 6 17
## 200297 4 0 1 3 6 14
## 200341 1 2 1 11 3 18
## 200379 3 1 2 4 9 19
Comments:
- sample size for each group looks small, which means we may not be able to come up with any reliable findings about flavors per store
# Cross-tab of observation counts: stores (rows) by package type (columns),
# with totals; head() shows the first six stores
gm_joined_data %>%
tabyl(iri_key, package) %>%
adorn_totals(where = c("row", "col")) %>%
head()
## iri_key BOX CUP Total
## 200171 15 0 15
## 200197 27 0 27
## 200272 17 0 17
## 200297 14 0 14
## 200341 18 0 18
## 200379 19 0 19
Comments:
- sample size for cup packaging across all stores is small, we won't be able to come up with any reliable findings about cup packaging per store
# Cross-tab of observation counts: stores (rows) by cereal (columns),
# with totals; head() shows the first six stores
gm_joined_data %>%
tabyl(iri_key, cereal) %>%
adorn_totals(where = c("row", "col")) %>%
head()
## iri_key CHEERIOS CINNAMON TST CR COCOA KRISPIES COCOA PUFFS FROOT LOOPS
## 200171 1 2 2 1 1
## 200197 1 1 0 0 4
## 200272 2 0 0 0 3
## 200297 0 4 0 0 1
## 200341 0 1 1 1 1
## 200379 3 3 0 1 2
## FROSTED FLAKES FROSTED MINI WHEATS GRAPE NUTS KIX LUCKY CHARMS RAISIN BRAN
## 1 0 1 0 0 1
## 3 2 1 4 4 1
## 2 0 4 1 0 1
## 0 0 2 0 1 0
## 3 2 1 4 0 0
## 2 0 2 0 3 0
## RICE KRISPIES SHREDDED WHEAT SMART START SPECIAL K Total
## 2 0 1 2 15
## 4 1 0 1 27
## 2 0 2 0 17
## 2 1 3 0 14
## 1 1 1 1 18
## 2 0 1 0 19
Comments:
- sample size for each group looks small, we may not be able to come up with any reliable findings about cereal per store
# Cross-tab of observation counts: weeks (rows) by promo flag (columns),
# with row and column totals; saved for the table rendered below
week_promo <- gm_joined_data %>%
tabyl(week, promo) %>%
adorn_totals(where = c("row", "col"))
| week | 0 | 1 | Total |
|---|---|---|---|
| 1 | 315 | 68 | 383 |
| 2 | 320 | 81 | 401 |
| 3 | 322 | 80 | 402 |
| 4 | 318 | 91 | 409 |
| 5 | 346 | 81 | 427 |
| 6 | 343 | 85 | 428 |
| 7 | 323 | 112 | 435 |
| 8 | 286 | 100 | 386 |
| 9 | 298 | 100 | 398 |
| 10 | 300 | 89 | 389 |
| 11 | 345 | 67 | 412 |
| 12 | 372 | 101 | 473 |
| 13 | 339 | 82 | 421 |
| 14 | 326 | 103 | 429 |
| 15 | 316 | 81 | 397 |
| 16 | 323 | 80 | 403 |
| 17 | 329 | 85 | 414 |
| 18 | 333 | 102 | 435 |
| 19 | 332 | 89 | 421 |
| 20 | 355 | 90 | 445 |
| 21 | 297 | 92 | 389 |
| 22 | 379 | 96 | 475 |
| 23 | 312 | 87 | 399 |
| 24 | 320 | 105 | 425 |
| 25 | 335 | 88 | 423 |
| 26 | 320 | 100 | 420 |
| 27 | 350 | 82 | 432 |
| 28 | 342 | 82 | 424 |
| 29 | 364 | 85 | 449 |
| 30 | 358 | 81 | 439 |
| 31 | 320 | 89 | 409 |
| 32 | 336 | 103 | 439 |
| 33 | 303 | 90 | 393 |
| 34 | 291 | 103 | 394 |
| 35 | 303 | 96 | 399 |
| 36 | 342 | 94 | 436 |
| 37 | 387 | 77 | 464 |
| 38 | 365 | 76 | 441 |
| 39 | 345 | 69 | 414 |
| 40 | 340 | 101 | 441 |
| 41 | 343 | 69 | 412 |
| 42 | 334 | 84 | 418 |
| 43 | 343 | 102 | 445 |
| 44 | 353 | 88 | 441 |
| 45 | 326 | 113 | 439 |
| 46 | 326 | 94 | 420 |
| 47 | 344 | 87 | 431 |
| 48 | 363 | 82 | 445 |
| 49 | 372 | 57 | 429 |
| 50 | 295 | 74 | 369 |
| 51 | 339 | 61 | 400 |
| 52 | 317 | 71 | 388 |
| Total | 17305 | 4545 | 21850 |
Comments:
- sample size for each group looks good
- every week has more observations when there is no in store promotion
# Cross-tab of observation counts: weeks (rows) by ad type (columns),
# with row and column totals
week_ad <- gm_joined_data %>%
  tabyl(week, ad) %>%
  adorn_totals(where = c("row", "col"))
# Weeks with fewer than this many observations of an ad type are flagged as
# too small a sample; naming the cutoff removes the duplicated magic number
small_sample_cutoff <- 20
# Weeks with a small sample of medium (A) advertisements
small_sample_a <- week_ad %>%
  filter(A < small_sample_cutoff)
# Weeks with a small sample of small (B) advertisements
small_sample_b <- week_ad %>%
  filter(B < small_sample_cutoff)
| week | A | B | NONE | Total |
|---|---|---|---|---|
| 1 | 25 | 16 | 342 | 383 |
| 2 | 38 | 15 | 348 | 401 |
| 3 | 36 | 14 | 352 | 402 |
| 4 | 27 | 30 | 352 | 409 |
| 5 | 12 | 17 | 398 | 427 |
| 6 | 37 | 40 | 351 | 428 |
| 7 | 62 | 31 | 342 | 435 |
| 8 | 53 | 16 | 317 | 386 |
| 9 | 33 | 30 | 335 | 398 |
| 10 | 48 | 28 | 313 | 389 |
| 11 | 30 | 18 | 364 | 412 |
| 12 | 42 | 35 | 396 | 473 |
| 13 | 36 | 11 | 374 | 421 |
| 14 | 31 | 17 | 381 | 429 |
| 15 | 13 | 39 | 345 | 397 |
| 16 | 35 | 12 | 356 | 403 |
| 17 | 24 | 39 | 351 | 414 |
| 18 | 23 | 23 | 389 | 435 |
| 19 | 28 | 28 | 365 | 421 |
| 20 | 20 | 19 | 406 | 445 |
| 21 | 29 | 36 | 324 | 389 |
| 22 | 28 | 16 | 431 | 475 |
| 23 | 36 | 13 | 350 | 399 |
| 24 | 25 | 16 | 384 | 425 |
| 25 | 39 | 21 | 363 | 423 |
| 26 | 43 | 24 | 353 | 420 |
| 27 | 24 | 29 | 379 | 432 |
| 28 | 24 | 21 | 379 | 424 |
| 29 | 34 | 23 | 392 | 449 |
| 30 | 31 | 20 | 388 | 439 |
| 31 | 28 | 22 | 359 | 409 |
| 32 | 29 | 25 | 385 | 439 |
| 33 | 25 | 21 | 347 | 393 |
| 34 | 21 | 16 | 357 | 394 |
| 35 | 41 | 14 | 344 | 399 |
| 36 | 23 | 19 | 394 | 436 |
| 37 | 31 | 19 | 414 | 464 |
| 38 | 24 | 12 | 405 | 441 |
| 39 | 4 | 14 | 396 | 414 |
| 40 | 25 | 24 | 392 | 441 |
| 41 | 24 | 14 | 374 | 412 |
| 42 | 18 | 12 | 388 | 418 |
| 43 | 15 | 15 | 415 | 445 |
| 44 | 34 | 18 | 389 | 441 |
| 45 | 36 | 17 | 386 | 439 |
| 46 | 18 | 19 | 383 | 420 |
| 47 | 7 | 20 | 404 | 431 |
| 48 | 11 | 25 | 409 | 445 |
| 49 | 16 | 8 | 405 | 429 |
| 50 | 19 | 17 | 333 | 369 |
| 51 | 21 | 7 | 372 | 400 |
| 52 | 20 | 6 | 362 | 388 |
| Total | 1456 | 1061 | 19333 | 21850 |
Comments:
- Medium Advertisements (A) have small sample sizes for the following weeks:
| week | A | B | NONE | Total |
|---|---|---|---|---|
| 5 | 12 | 17 | 398 | 427 |
| 15 | 13 | 39 | 345 | 397 |
| 39 | 4 | 14 | 396 | 414 |
| 42 | 18 | 12 | 388 | 418 |
| 43 | 15 | 15 | 415 | 445 |
| 46 | 18 | 19 | 383 | 420 |
| 47 | 7 | 20 | 404 | 431 |
| 48 | 11 | 25 | 409 | 445 |
| 49 | 16 | 8 | 405 | 429 |
| 50 | 19 | 17 | 333 | 369 |
- Small Advertisements (B) have small sample sizes for the following weeks:
| week | A | B | NONE | Total |
|---|---|---|---|---|
| 1 | 25 | 16 | 342 | 383 |
| 2 | 38 | 15 | 348 | 401 |
| 3 | 36 | 14 | 352 | 402 |
| 5 | 12 | 17 | 398 | 427 |
| 8 | 53 | 16 | 317 | 386 |
| 11 | 30 | 18 | 364 | 412 |
| 13 | 36 | 11 | 374 | 421 |
| 14 | 31 | 17 | 381 | 429 |
| 16 | 35 | 12 | 356 | 403 |
| 20 | 20 | 19 | 406 | 445 |
| 22 | 28 | 16 | 431 | 475 |
| 23 | 36 | 13 | 350 | 399 |
| 24 | 25 | 16 | 384 | 425 |
| 34 | 21 | 16 | 357 | 394 |
| 35 | 41 | 14 | 344 | 399 |
| 36 | 23 | 19 | 394 | 436 |
| 37 | 31 | 19 | 414 | 464 |
| 38 | 24 | 12 | 405 | 441 |
| 39 | 4 | 14 | 396 | 414 |
| 41 | 24 | 14 | 374 | 412 |
| 42 | 18 | 12 | 388 | 418 |
| 43 | 15 | 15 | 415 | 445 |
| 44 | 34 | 18 | 389 | 441 |
| 45 | 36 | 17 | 386 | 439 |
| 46 | 18 | 19 | 383 | 420 |
| 49 | 16 | 8 | 405 | 429 |
| 50 | 19 | 17 | 333 | 369 |
| 51 | 21 | 7 | 372 | 400 |
| 52 | 20 | 6 | 362 | 388 |
# Cross-tabulate observation counts of brand by week, with row/column totals
week_brand <- gm_joined_data %>%
  tabyl(week, brand) %>%
  adorn_totals(c("row", "col"))
| week | GENERAL MILLS | KELLOGGS | POST | Total |
|---|---|---|---|---|
| 1 | 134 | 219 | 30 | 383 |
| 2 | 142 | 221 | 38 | 401 |
| 3 | 141 | 208 | 53 | 402 |
| 4 | 137 | 218 | 54 | 409 |
| 5 | 162 | 221 | 44 | 427 |
| 6 | 145 | 231 | 52 | 428 |
| 7 | 136 | 250 | 49 | 435 |
| 8 | 124 | 207 | 55 | 386 |
| 9 | 152 | 185 | 61 | 398 |
| 10 | 148 | 197 | 44 | 389 |
| 11 | 142 | 213 | 57 | 412 |
| 12 | 171 | 245 | 57 | 473 |
| 13 | 143 | 237 | 41 | 421 |
| 14 | 144 | 243 | 42 | 429 |
| 15 | 137 | 219 | 41 | 397 |
| 16 | 127 | 227 | 49 | 403 |
| 17 | 146 | 213 | 55 | 414 |
| 18 | 159 | 223 | 53 | 435 |
| 19 | 130 | 243 | 48 | 421 |
| 20 | 131 | 260 | 54 | 445 |
| 21 | 130 | 208 | 51 | 389 |
| 22 | 154 | 260 | 61 | 475 |
| 23 | 135 | 201 | 63 | 399 |
| 24 | 160 | 216 | 49 | 425 |
| 25 | 125 | 246 | 52 | 423 |
| 26 | 126 | 249 | 45 | 420 |
| 27 | 144 | 230 | 58 | 432 |
| 28 | 137 | 233 | 54 | 424 |
| 29 | 141 | 257 | 51 | 449 |
| 30 | 145 | 233 | 61 | 439 |
| 31 | 116 | 243 | 50 | 409 |
| 32 | 142 | 258 | 39 | 439 |
| 33 | 120 | 238 | 35 | 393 |
| 34 | 100 | 248 | 46 | 394 |
| 35 | 129 | 231 | 39 | 399 |
| 36 | 127 | 269 | 40 | 436 |
| 37 | 134 | 283 | 47 | 464 |
| 38 | 148 | 251 | 42 | 441 |
| 39 | 124 | 250 | 40 | 414 |
| 40 | 135 | 254 | 52 | 441 |
| 41 | 122 | 242 | 48 | 412 |
| 42 | 151 | 242 | 25 | 418 |
| 43 | 156 | 242 | 47 | 445 |
| 44 | 155 | 242 | 44 | 441 |
| 45 | 139 | 253 | 47 | 439 |
| 46 | 147 | 220 | 53 | 420 |
| 47 | 126 | 261 | 44 | 431 |
| 48 | 150 | 251 | 44 | 445 |
| 49 | 125 | 256 | 48 | 429 |
| 50 | 127 | 197 | 45 | 369 |
| 51 | 124 | 225 | 51 | 400 |
| 52 | 144 | 214 | 30 | 388 |
| Total | 7189 | 12183 | 2478 | 21850 |
Comments:
- sample size for each group looks good
# Cross-tabulate observation counts of flavor by week, with row/column totals
week_flavor <- gm_joined_data %>%
  tabyl(week, flavor) %>%
  adorn_totals(c("row", "col"))
| week | CINNAMON TOAST | COCOA | FRUIT | REGULAR | TOASTED | Total |
|---|---|---|---|---|---|---|
| 1 | 38 | 24 | 31 | 174 | 116 | 383 |
| 2 | 40 | 33 | 30 | 166 | 132 | 401 |
| 3 | 32 | 37 | 35 | 166 | 132 | 402 |
| 4 | 36 | 39 | 35 | 177 | 122 | 409 |
| 5 | 40 | 35 | 37 | 166 | 149 | 427 |
| 6 | 36 | 29 | 34 | 185 | 144 | 428 |
| 7 | 38 | 29 | 47 | 170 | 151 | 435 |
| 8 | 41 | 30 | 34 | 158 | 123 | 386 |
| 9 | 43 | 31 | 36 | 166 | 122 | 398 |
| 10 | 35 | 37 | 35 | 141 | 141 | 389 |
| 11 | 30 | 35 | 34 | 174 | 139 | 412 |
| 12 | 45 | 43 | 36 | 192 | 157 | 473 |
| 13 | 34 | 40 | 45 | 160 | 142 | 421 |
| 14 | 31 | 37 | 35 | 178 | 148 | 429 |
| 15 | 33 | 38 | 35 | 159 | 132 | 397 |
| 16 | 48 | 34 | 43 | 160 | 118 | 403 |
| 17 | 45 | 27 | 39 | 160 | 143 | 414 |
| 18 | 39 | 34 | 33 | 186 | 143 | 435 |
| 19 | 34 | 32 | 33 | 173 | 149 | 421 |
| 20 | 32 | 37 | 40 | 192 | 144 | 445 |
| 21 | 42 | 32 | 29 | 167 | 119 | 389 |
| 22 | 36 | 28 | 51 | 197 | 163 | 475 |
| 23 | 37 | 22 | 39 | 173 | 128 | 399 |
| 24 | 41 | 36 | 47 | 161 | 140 | 425 |
| 25 | 38 | 35 | 46 | 177 | 127 | 423 |
| 26 | 30 | 34 | 46 | 180 | 130 | 420 |
| 27 | 44 | 29 | 48 | 173 | 138 | 432 |
| 28 | 35 | 48 | 49 | 158 | 134 | 424 |
| 29 | 32 | 46 | 63 | 160 | 148 | 449 |
| 30 | 35 | 47 | 50 | 190 | 117 | 439 |
| 31 | 34 | 41 | 43 | 183 | 108 | 409 |
| 32 | 30 | 51 | 43 | 160 | 155 | 439 |
| 33 | 38 | 48 | 46 | 152 | 109 | 393 |
| 34 | 20 | 32 | 47 | 161 | 134 | 394 |
| 35 | 32 | 41 | 50 | 148 | 128 | 399 |
| 36 | 32 | 40 | 53 | 177 | 134 | 436 |
| 37 | 26 | 34 | 51 | 199 | 154 | 464 |
| 38 | 40 | 40 | 41 | 181 | 139 | 441 |
| 39 | 29 | 45 | 52 | 156 | 132 | 414 |
| 40 | 41 | 38 | 36 | 198 | 128 | 441 |
| 41 | 39 | 46 | 42 | 164 | 121 | 412 |
| 42 | 34 | 36 | 45 | 154 | 149 | 418 |
| 43 | 36 | 45 | 36 | 174 | 154 | 445 |
| 44 | 39 | 39 | 49 | 169 | 145 | 441 |
| 45 | 31 | 36 | 37 | 182 | 153 | 439 |
| 46 | 22 | 38 | 45 | 164 | 151 | 420 |
| 47 | 31 | 47 | 41 | 164 | 148 | 431 |
| 48 | 38 | 40 | 61 | 166 | 140 | 445 |
| 49 | 36 | 38 | 41 | 165 | 149 | 429 |
| 50 | 26 | 26 | 46 | 152 | 119 | 369 |
| 51 | 25 | 30 | 45 | 175 | 125 | 400 |
| 52 | 35 | 32 | 47 | 133 | 141 | 388 |
| Total | 1834 | 1901 | 2192 | 8816 | 7107 | 21850 |
Comments:
- sample size for each group looks good
# Cross-tabulate observation counts of package type by week, with totals
week_package <- gm_joined_data %>%
  tabyl(week, package) %>%
  adorn_totals(c("row", "col"))
| week | BOX | CUP | Total |
|---|---|---|---|
| 1 | 373 | 10 | 383 |
| 2 | 392 | 9 | 401 |
| 3 | 393 | 9 | 402 |
| 4 | 401 | 8 | 409 |
| 5 | 413 | 14 | 427 |
| 6 | 414 | 14 | 428 |
| 7 | 426 | 9 | 435 |
| 8 | 382 | 4 | 386 |
| 9 | 392 | 6 | 398 |
| 10 | 377 | 12 | 389 |
| 11 | 403 | 9 | 412 |
| 12 | 461 | 12 | 473 |
| 13 | 414 | 7 | 421 |
| 14 | 420 | 9 | 429 |
| 15 | 391 | 6 | 397 |
| 16 | 389 | 14 | 403 |
| 17 | 407 | 7 | 414 |
| 18 | 428 | 7 | 435 |
| 19 | 409 | 12 | 421 |
| 20 | 438 | 7 | 445 |
| 21 | 382 | 7 | 389 |
| 22 | 467 | 8 | 475 |
| 23 | 392 | 7 | 399 |
| 24 | 411 | 14 | 425 |
| 25 | 411 | 12 | 423 |
| 26 | 409 | 11 | 420 |
| 27 | 419 | 13 | 432 |
| 28 | 415 | 9 | 424 |
| 29 | 437 | 12 | 449 |
| 30 | 427 | 12 | 439 |
| 31 | 400 | 9 | 409 |
| 32 | 427 | 12 | 439 |
| 33 | 387 | 6 | 393 |
| 34 | 380 | 14 | 394 |
| 35 | 385 | 14 | 399 |
| 36 | 427 | 9 | 436 |
| 37 | 457 | 7 | 464 |
| 38 | 426 | 15 | 441 |
| 39 | 406 | 8 | 414 |
| 40 | 431 | 10 | 441 |
| 41 | 403 | 9 | 412 |
| 42 | 407 | 11 | 418 |
| 43 | 437 | 8 | 445 |
| 44 | 427 | 14 | 441 |
| 45 | 425 | 14 | 439 |
| 46 | 412 | 8 | 420 |
| 47 | 415 | 16 | 431 |
| 48 | 427 | 18 | 445 |
| 49 | 411 | 18 | 429 |
| 50 | 359 | 10 | 369 |
| 51 | 385 | 15 | 400 |
| 52 | 379 | 9 | 388 |
| Total | 21306 | 544 | 21850 |
Comments:
- sample size for cup packaging across weeks is small, we won't be able to come up with any reliable findings about cup packaging per week
# Cross-tabulate observation counts of cereal by week, with row/column totals
week_cereal <- gm_joined_data %>%
  tabyl(week, cereal) %>%
  adorn_totals(c("row", "col"))
| week | CHEERIOS | CINNAMON TST CR | COCOA KRISPIES | COCOA PUFFS | FROOT LOOPS | FROSTED FLAKES | FROSTED MINI WHEATS | GRAPE NUTS | KIX | LUCKY CHARMS | RAISIN BRAN | RICE KRISPIES | SHREDDED WHEAT | SMART START | SPECIAL K | Total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 25 | 38 | 8 | 16 | 31 | 61 | 31 | 18 | 23 | 32 | 29 | 23 | 12 | 15 | 21 | 383 |
| 2 | 28 | 40 | 17 | 16 | 30 | 45 | 36 | 22 | 18 | 40 | 29 | 22 | 16 | 22 | 20 | 401 |
| 3 | 30 | 32 | 14 | 23 | 35 | 45 | 34 | 28 | 19 | 37 | 15 | 22 | 25 | 16 | 27 | 402 |
| 4 | 24 | 36 | 13 | 26 | 35 | 46 | 30 | 31 | 19 | 32 | 28 | 24 | 23 | 20 | 22 | 409 |
| 5 | 37 | 40 | 18 | 17 | 37 | 43 | 23 | 27 | 30 | 38 | 26 | 26 | 17 | 16 | 32 | 427 |
| 6 | 38 | 36 | 7 | 22 | 34 | 58 | 25 | 29 | 19 | 30 | 31 | 29 | 23 | 19 | 28 | 428 |
| 7 | 28 | 38 | 8 | 21 | 47 | 52 | 37 | 27 | 11 | 38 | 21 | 25 | 22 | 33 | 27 | 435 |
| 8 | 18 | 41 | 9 | 21 | 34 | 44 | 22 | 31 | 13 | 31 | 24 | 33 | 24 | 17 | 24 | 386 |
| 9 | 27 | 43 | 8 | 23 | 36 | 32 | 27 | 31 | 24 | 35 | 22 | 22 | 30 | 21 | 17 | 398 |
| 10 | 35 | 35 | 9 | 28 | 35 | 29 | 34 | 25 | 12 | 38 | 22 | 30 | 19 | 15 | 23 | 389 |
| 11 | 26 | 30 | 10 | 25 | 34 | 43 | 28 | 32 | 22 | 39 | 24 | 35 | 25 | 19 | 20 | 412 |
| 12 | 36 | 45 | 18 | 25 | 36 | 46 | 31 | 28 | 29 | 36 | 29 | 28 | 29 | 25 | 32 | 473 |
| 13 | 29 | 34 | 15 | 25 | 45 | 45 | 28 | 20 | 25 | 30 | 21 | 29 | 21 | 31 | 23 | 421 |
| 14 | 30 | 31 | 11 | 26 | 35 | 48 | 39 | 26 | 20 | 37 | 28 | 29 | 16 | 26 | 27 | 429 |
| 15 | 29 | 33 | 14 | 24 | 35 | 47 | 29 | 24 | 19 | 32 | 23 | 27 | 17 | 23 | 21 | 397 |
| 16 | 23 | 48 | 14 | 20 | 43 | 41 | 26 | 24 | 16 | 20 | 28 | 31 | 25 | 18 | 26 | 403 |
| 17 | 29 | 45 | 13 | 14 | 39 | 34 | 31 | 26 | 23 | 35 | 17 | 28 | 29 | 24 | 27 | 414 |
| 18 | 28 | 39 | 8 | 26 | 33 | 44 | 37 | 28 | 30 | 36 | 22 | 25 | 25 | 24 | 30 | 435 |
| 19 | 27 | 34 | 13 | 19 | 33 | 53 | 33 | 29 | 19 | 31 | 20 | 34 | 19 | 22 | 35 | 421 |
| 20 | 21 | 32 | 18 | 19 | 40 | 58 | 32 | 27 | 26 | 33 | 22 | 38 | 27 | 19 | 33 | 445 |
| 21 | 22 | 42 | 15 | 17 | 29 | 52 | 26 | 24 | 16 | 33 | 22 | 21 | 27 | 24 | 19 | 389 |
| 22 | 39 | 36 | 15 | 13 | 51 | 47 | 39 | 33 | 28 | 38 | 22 | 21 | 28 | 34 | 31 | 475 |
| 23 | 27 | 37 | 7 | 15 | 39 | 32 | 25 | 26 | 27 | 29 | 26 | 25 | 37 | 22 | 25 | 399 |
| 24 | 33 | 41 | 14 | 22 | 47 | 34 | 35 | 28 | 23 | 41 | 20 | 23 | 21 | 20 | 23 | 425 |
| 25 | 20 | 38 | 17 | 18 | 46 | 36 | 42 | 24 | 19 | 30 | 28 | 30 | 28 | 17 | 30 | 423 |
| 26 | 25 | 30 | 18 | 16 | 46 | 48 | 29 | 29 | 23 | 32 | 35 | 30 | 16 | 23 | 20 | 420 |
| 27 | 26 | 44 | 12 | 17 | 48 | 49 | 21 | 33 | 19 | 38 | 26 | 33 | 25 | 22 | 19 | 432 |
| 28 | 23 | 35 | 23 | 25 | 49 | 40 | 29 | 30 | 22 | 32 | 13 | 22 | 24 | 27 | 30 | 424 |
| 29 | 30 | 32 | 29 | 17 | 63 | 40 | 19 | 24 | 26 | 36 | 23 | 28 | 27 | 24 | 31 | 449 |
| 30 | 27 | 35 | 20 | 27 | 50 | 40 | 31 | 32 | 29 | 27 | 28 | 25 | 29 | 15 | 24 | 439 |
| 31 | 16 | 34 | 27 | 14 | 43 | 45 | 35 | 28 | 26 | 26 | 27 | 19 | 22 | 21 | 26 | 409 |
| 32 | 32 | 30 | 26 | 25 | 43 | 53 | 28 | 21 | 21 | 34 | 18 | 36 | 18 | 20 | 34 | 439 |
| 33 | 22 | 38 | 28 | 20 | 46 | 36 | 39 | 18 | 22 | 18 | 20 | 18 | 17 | 22 | 29 | 393 |
| 34 | 22 | 20 | 21 | 11 | 47 | 39 | 31 | 24 | 25 | 22 | 20 | 30 | 22 | 31 | 29 | 394 |
| 35 | 17 | 32 | 22 | 19 | 50 | 39 | 29 | 17 | 24 | 37 | 15 | 24 | 22 | 24 | 28 | 399 |
| 36 | 28 | 32 | 23 | 17 | 53 | 50 | 34 | 19 | 19 | 31 | 34 | 33 | 21 | 21 | 21 | 436 |
| 37 | 30 | 26 | 19 | 15 | 51 | 48 | 34 | 20 | 32 | 31 | 38 | 29 | 27 | 32 | 32 | 464 |
| 38 | 34 | 40 | 21 | 19 | 41 | 57 | 32 | 15 | 25 | 30 | 25 | 36 | 27 | 17 | 22 | 441 |
| 39 | 28 | 29 | 26 | 19 | 52 | 40 | 30 | 24 | 25 | 23 | 21 | 33 | 16 | 23 | 25 | 414 |
| 40 | 27 | 41 | 22 | 16 | 36 | 52 | 35 | 25 | 25 | 26 | 34 | 26 | 27 | 26 | 23 | 441 |
| 41 | 23 | 39 | 27 | 19 | 42 | 38 | 29 | 23 | 20 | 21 | 29 | 34 | 25 | 15 | 28 | 412 |
| 42 | 29 | 34 | 18 | 18 | 45 | 43 | 29 | 9 | 30 | 40 | 27 | 25 | 16 | 23 | 32 | 418 |
| 43 | 32 | 36 | 21 | 24 | 36 | 47 | 27 | 22 | 32 | 32 | 21 | 34 | 25 | 27 | 29 | 445 |
| 44 | 34 | 39 | 23 | 16 | 49 | 37 | 33 | 25 | 31 | 35 | 24 | 32 | 19 | 17 | 27 | 441 |
| 45 | 42 | 31 | 21 | 15 | 37 | 46 | 30 | 25 | 29 | 22 | 30 | 32 | 22 | 26 | 31 | 439 |
| 46 | 33 | 22 | 15 | 23 | 45 | 39 | 25 | 24 | 25 | 44 | 22 | 30 | 29 | 15 | 29 | 420 |
| 47 | 22 | 31 | 23 | 24 | 41 | 48 | 29 | 17 | 20 | 29 | 23 | 31 | 27 | 20 | 46 | 431 |
| 48 | 31 | 38 | 17 | 23 | 61 | 48 | 31 | 24 | 23 | 35 | 20 | 35 | 20 | 22 | 17 | 445 |
| 49 | 32 | 36 | 25 | 13 | 41 | 40 | 27 | 27 | 19 | 25 | 31 | 31 | 21 | 27 | 34 | 429 |
| 50 | 24 | 26 | 13 | 13 | 46 | 34 | 19 | 24 | 33 | 31 | 21 | 23 | 21 | 15 | 26 | 369 |
| 51 | 27 | 25 | 17 | 13 | 45 | 47 | 31 | 27 | 24 | 35 | 22 | 18 | 24 | 21 | 24 | 400 |
| 52 | 33 | 35 | 11 | 21 | 47 | 37 | 28 | 15 | 17 | 38 | 20 | 23 | 15 | 16 | 32 | 388 |
| Total | 1458 | 1834 | 881 | 1020 | 2192 | 2295 | 1574 | 1289 | 1196 | 1681 | 1266 | 1450 | 1189 | 1134 | 1391 | 21850 |
Comments:
- sample size for Cocoa Krispies sold in the weeks 1, 6, 7, 8, 9, 10, 18, and 23 may be too small
- week 42 might have a small sample size for Grape Nuts
- however, given the number of observations per week for each cereal the small counts for Cocoa Krispies and Grape Nuts cereals might not be an issue
- Split between in store promotions vs no in store promotions in data set is disproportionate
# Cross-tabulate observation counts of promo status by ad type, with totals
promo_ad <- gm_joined_data %>%
  tabyl(ad, promo) %>%
  adorn_totals(c("row", "col"))
| ad | 0 | 1 | Total |
|---|---|---|---|
| A | 683 | 773 | 1456 |
| B | 402 | 659 | 1061 |
| NONE | 16220 | 3113 | 19333 |
| Total | 17305 | 4545 | 21850 |
Comments:
- sample size for each group looks fine
- the peak advertisement category is none for when there is and isn't a promotion
# Cross-tabulate observation counts of promo status by brand, with totals
promo_brand <- gm_joined_data %>%
  tabyl(brand, promo) %>%
  adorn_totals(c("row", "col"))
| brand | 0 | 1 | Total |
|---|---|---|---|
| GENERAL MILLS | 5909 | 1280 | 7189 |
| KELLOGGS | 9470 | 2713 | 12183 |
| POST | 1926 | 552 | 2478 |
| Total | 17305 | 4545 | 21850 |
Comments:
- sample size for each group looks fine
- Kelloggs sells the most when there is and isn't an in store promotion
# Cross-tabulate observation counts of promo status by flavor, with totals
promo_flavor <- gm_joined_data %>%
  tabyl(flavor, promo) %>%
  adorn_totals(c("row", "col"))
| flavor | 0 | 1 | Total |
|---|---|---|---|
| CINNAMON TOAST | 1537 | 297 | 1834 |
| COCOA | 1416 | 485 | 1901 |
| FRUIT | 1673 | 519 | 2192 |
| REGULAR | 7057 | 1759 | 8816 |
| TOASTED | 5622 | 1485 | 7107 |
| Total | 17305 | 4545 | 21850 |
Comments:
- sample size for each group looks fine, Cinnamon Toast flavored sales when there is an in store promotion might have a small sample size
- regular flavor sells the most when there is and isn't an in store promotion
# Cross-tabulate observation counts of promo status by package type, with totals
promo_package <- gm_joined_data %>%
  tabyl(package, promo) %>%
  adorn_totals(c("row", "col"))
| package | 0 | 1 | Total |
|---|---|---|---|
| BOX | 16910 | 4396 | 21306 |
| CUP | 395 | 149 | 544 |
| Total | 17305 | 4545 | 21850 |
Comments:
- sample size for each group looks fine, when there is an in store promotion sales with cup packaging might have a small sample size
- box packaging sells the most when there is and isn't an in store promotion
# Cross-tabulate observation counts of promo status by cereal, with totals
promo_cereal <- gm_joined_data %>%
  tabyl(cereal, promo) %>%
  adorn_totals(c("row", "col"))
| cereal | 0 | 1 | Total |
|---|---|---|---|
| CHEERIOS | 1240 | 218 | 1458 |
| CINNAMON TST CR | 1537 | 297 | 1834 |
| COCOA KRISPIES | 647 | 234 | 881 |
| COCOA PUFFS | 769 | 251 | 1020 |
| FROOT LOOPS | 1673 | 519 | 2192 |
| FROSTED FLAKES | 1819 | 476 | 2295 |
| FROSTED MINI WHEATS | 1248 | 326 | 1574 |
| GRAPE NUTS | 1002 | 287 | 1289 |
| KIX | 997 | 199 | 1196 |
| LUCKY CHARMS | 1366 | 315 | 1681 |
| RAISIN BRAN | 1060 | 206 | 1266 |
| RICE KRISPIES | 1162 | 288 | 1450 |
| SHREDDED WHEAT | 924 | 265 | 1189 |
| SMART START | 812 | 322 | 1134 |
| SPECIAL K | 1049 | 342 | 1391 |
| Total | 17305 | 4545 | 21850 |
Comments:
- sample size for each group looks fine, Kix or Raisin Bran sales when there is an in store promotion might have a small sample size
- Frosted Flakes sells the most when there is and isn't an in store promotion
# Cross-tabulate observation counts of ad type by brand, with row/column totals
ad_brand <- gm_joined_data %>%
  tabyl(brand, ad) %>%
  adorn_totals(c("row", "col"))
| brand | A | B | NONE | Total |
|---|---|---|---|---|
| GENERAL MILLS | 442 | 272 | 6475 | 7189 |
| KELLOGGS | 903 | 664 | 10616 | 12183 |
| POST | 111 | 125 | 2242 | 2478 |
| Total | 1456 | 1061 | 19333 | 21850 |
Comments:
- Post brand sales when there is an advertisement (A or B) might have a small sample size
- Kelloggs sells the most for every advertisement category
# Cross-tabulate observation counts of ad type by flavor, with row/column totals
ad_flavor <- gm_joined_data %>%
  tabyl(flavor, ad) %>%
  adorn_totals(c("row", "col"))
| flavor | A | B | NONE | Total |
|---|---|---|---|---|
| CINNAMON TOAST | 114 | 51 | 1669 | 1834 |
| COCOA | 128 | 91 | 1682 | 1901 |
| FRUIT | 153 | 108 | 1931 | 2192 |
| REGULAR | 516 | 439 | 7861 | 8816 |
| TOASTED | 545 | 372 | 6190 | 7107 |
| Total | 1456 | 1061 | 19333 | 21850 |
Comments:
- Cinnamon Toast and Cocoa flavored sales when there is an advertisement category of B have a small sample size, Fruit might also have a small sample size
- When the advertisement category is A might also have a small sample size for Cinnamon Toast, Cocoa, and Fruit flavored sales
- Regular flavor sells the most when advertisement category is B or None
- Toasted sells the most when advertisement category is A
# Cross-tabulate observation counts of ad type by package type, with totals
ad_package <- gm_joined_data %>%
  tabyl(package, ad) %>%
  adorn_totals(c("row", "col"))
| package | A | B | NONE | Total |
|---|---|---|---|---|
| BOX | 1444 | 1025 | 18837 | 21306 |
| CUP | 12 | 36 | 496 | 544 |
| Total | 1456 | 1061 | 19333 | 21850 |
Comments:
- Cup packaging has a small sample size when there is an advertisement category of A or B
- Box packaging sells the most for every advertisement category
# Cross-tabulate observation counts of ad type by cereal, with row/column totals
ad_cereal <- gm_joined_data %>%
  tabyl(cereal, ad) %>%
  adorn_totals(c("row", "col"))
| cereal | A | B | NONE | Total |
|---|---|---|---|---|
| CHEERIOS | 79 | 44 | 1335 | 1458 |
| CINNAMON TST CR | 114 | 51 | 1669 | 1834 |
| COCOA KRISPIES | 52 | 37 | 792 | 881 |
| COCOA PUFFS | 76 | 54 | 890 | 1020 |
| FROOT LOOPS | 153 | 108 | 1931 | 2192 |
| FROSTED FLAKES | 141 | 124 | 2030 | 2295 |
| FROSTED MINI WHEATS | 127 | 88 | 1359 | 1574 |
| GRAPE NUTS | 46 | 63 | 1180 | 1289 |
| KIX | 70 | 54 | 1072 | 1196 |
| LUCKY CHARMS | 103 | 69 | 1509 | 1681 |
| RAISIN BRAN | 67 | 47 | 1152 | 1266 |
| RICE KRISPIES | 114 | 88 | 1248 | 1450 |
| SHREDDED WHEAT | 65 | 62 | 1062 | 1189 |
| SMART START | 124 | 90 | 920 | 1134 |
| SPECIAL K | 125 | 82 | 1184 | 1391 |
| Total | 1456 | 1061 | 19333 | 21850 |
Comments:
- Advertisement category B doesn't have a large enough sample size for all cereal categories
- Froot Loops cereal sells the most when the advertisement category is A
- Frosted Flakes appear to sell the most when the advertisement category is B
- Frosted Flakes cereal sells the most when the advertisement category is none, followed closely by Froot Loops
# Cross-tabulate observation counts of brand by flavor, with row/column totals
brand_flavor <- gm_joined_data %>%
  tabyl(flavor, brand) %>%
  adorn_totals(c("row", "col"))
| flavor | GENERAL MILLS | KELLOGGS | POST | Total |
|---|---|---|---|---|
| CINNAMON TOAST | 1834 | 0 | 0 | 1834 |
| COCOA | 1020 | 881 | 0 | 1901 |
| FRUIT | 0 | 2192 | 0 | 2192 |
| REGULAR | 1203 | 5135 | 2478 | 8816 |
| TOASTED | 3132 | 3975 | 0 | 7107 |
| Total | 7189 | 12183 | 2478 | 21850 |
Comments:
- When the brand is General Mills, there is a small sample size for Fruit flavored sales
- Toasted flavor sold the most
- When the brand is Kelloggs, there is a small sample size for Cinnamon Toast flavored sales and potentially a small sample for Cocoa flavored sales
- Regular flavor sold the most
- When the brand is Post the only flavor sold was Regular
# Cross-tabulate observation counts of brand by package type, with totals
brand_package <- gm_joined_data %>%
  tabyl(package, brand) %>%
  adorn_totals(c("row", "col"))
| package | GENERAL MILLS | KELLOGGS | POST | Total |
|---|---|---|---|---|
| BOX | 7025 | 11803 | 2478 | 21306 |
| CUP | 164 | 380 | 0 | 544 |
| Total | 7189 | 12183 | 2478 | 21850 |
Comments:
- When the brand is General Mills, there is a small sample size for Cup packaging
- When the brand is Kelloggs, there is a potentially a small sample for Cup packaging
- When the brand is Post the only packaging sold was Box
# Cross-tabulate observation counts of brand by cereal, with row/column totals
brand_cereal <- gm_joined_data %>%
  tabyl(cereal, brand) %>%
  adorn_totals(c("row", "col"))
| cereal | GENERAL MILLS | KELLOGGS | POST | Total |
|---|---|---|---|---|
| CHEERIOS | 1458 | 0 | 0 | 1458 |
| CINNAMON TST CR | 1834 | 0 | 0 | 1834 |
| COCOA KRISPIES | 0 | 881 | 0 | 881 |
| COCOA PUFFS | 1020 | 0 | 0 | 1020 |
| FROOT LOOPS | 0 | 2192 | 0 | 2192 |
| FROSTED FLAKES | 0 | 2295 | 0 | 2295 |
| FROSTED MINI WHEATS | 0 | 1574 | 0 | 1574 |
| GRAPE NUTS | 0 | 0 | 1289 | 1289 |
| KIX | 1196 | 0 | 0 | 1196 |
| LUCKY CHARMS | 1681 | 0 | 0 | 1681 |
| RAISIN BRAN | 0 | 1266 | 0 | 1266 |
| RICE KRISPIES | 0 | 1450 | 0 | 1450 |
| SHREDDED WHEAT | 0 | 0 | 1189 | 1189 |
| SMART START | 0 | 1134 | 0 | 1134 |
| SPECIAL K | 0 | 1391 | 0 | 1391 |
| Total | 7189 | 12183 | 2478 | 21850 |
Comments:
- When the brand is General Mills, the only cereals sold are: Cheerios, Cinnamon TST CR, Cocoa Puffs, Kix, and Lucky Charms
- Cinnamon TST CR sold the most
- Cocoa Puffs sold the least
- When the brand is Kelloggs, the only cereals sold are: Cocoa Krispies, Froot Loops, Frosted Flakes, Frosted Mini Wheats, Raisin Bran, Rice Krispies, Smart Start, and Special K
- Frosted Flakes sold the most
- Cocoa Krispies sold the least
- When the brand is Post, the only cereals sold are: Grape Nuts and Shredded Wheat
- Grape Nuts sold the most
- Shredded Wheat sold the least
# Cross-tabulate observation counts of flavor by package type, with totals
flavor_package <- gm_joined_data %>%
  tabyl(package, flavor) %>%
  adorn_totals(c("row", "col"))
| package | CINNAMON TOAST | COCOA | FRUIT | REGULAR | TOASTED | Total |
|---|---|---|---|---|---|---|
| BOX | 1774 | 1897 | 2099 | 8644 | 6892 | 21306 |
| CUP | 60 | 4 | 93 | 172 | 215 | 544 |
| Total | 1834 | 1901 | 2192 | 8816 | 7107 | 21850 |
Comments:
- Box sold the most for each flavor
- When the packaging is Cup the flavors of Cinnamon Toast, Cocoa, Fruit, and Regular have small sample sizes
# Cross-tabulate observation counts of flavor by cereal, with row/column totals
flavor_cereal <- gm_joined_data %>%
  tabyl(cereal, flavor) %>%
  adorn_totals(c("row", "col"))
| cereal | CINNAMON TOAST | COCOA | FRUIT | REGULAR | TOASTED | Total |
|---|---|---|---|---|---|---|
| CHEERIOS | 0 | 0 | 0 | 4 | 1454 | 1458 |
| CINNAMON TST CR | 1834 | 0 | 0 | 0 | 0 | 1834 |
| COCOA KRISPIES | 0 | 881 | 0 | 0 | 0 | 881 |
| COCOA PUFFS | 0 | 1020 | 0 | 0 | 0 | 1020 |
| FROOT LOOPS | 0 | 0 | 2192 | 0 | 0 | 2192 |
| FROSTED FLAKES | 0 | 0 | 0 | 2295 | 0 | 2295 |
| FROSTED MINI WHEATS | 0 | 0 | 0 | 1574 | 0 | 1574 |
| GRAPE NUTS | 0 | 0 | 0 | 1289 | 0 | 1289 |
| KIX | 0 | 0 | 0 | 1196 | 0 | 1196 |
| LUCKY CHARMS | 0 | 0 | 0 | 3 | 1678 | 1681 |
| RAISIN BRAN | 0 | 0 | 0 | 1266 | 0 | 1266 |
| RICE KRISPIES | 0 | 0 | 0 | 0 | 1450 | 1450 |
| SHREDDED WHEAT | 0 | 0 | 0 | 1189 | 0 | 1189 |
| SMART START | 0 | 0 | 0 | 0 | 1134 | 1134 |
| SPECIAL K | 0 | 0 | 0 | 0 | 1391 | 1391 |
| Total | 1834 | 1901 | 2192 | 8816 | 7107 | 21850 |
Comments:
- When the flavor is Cinnamon Toast, Cinnamon TST CR is the only cereal sold
- When the flavor is Cocoa, Cocoa Krispies and Cocoa Puffs are the only cereals sold
- Cocoa Puffs sold the most
- Cocoa Krispies sold the least
- When the flavor is Fruit, Froot Loops is the only cereal sold
- When the flavor is Regular, the only cereals sold are: Cheerios, Frosted Flakes, Frosted Mini Wheats, Grape Nuts, Kix, Lucky Charms, Raisin Bran, and Shredded Wheat
- Frosted Flakes sold the most
- Lucky Charms sold the least
- When the flavor is Toasted, the only cereals sold are: Cheerios, Lucky Charms, Rice Krispies, Smart Start, Special K
- Lucky Charms sold the most
- Smart Start sold the least
# Cross-tabulate observation counts of package type by cereal, with totals.
# NOTE(review): "cerial" is a typo for "cereal"; the name is kept unchanged in
# case later code references this variable.
package_cerial <- gm_joined_data %>%
  tabyl(cereal, package) %>%
  adorn_totals(c("row", "col"))
| cereal | BOX | CUP | Total |
|---|---|---|---|
| CHEERIOS | 1411 | 47 | 1458 |
| CINNAMON TST CR | 1774 | 60 | 1834 |
| COCOA KRISPIES | 877 | 4 | 881 |
| COCOA PUFFS | 1020 | 0 | 1020 |
| FROOT LOOPS | 2099 | 93 | 2192 |
| FROSTED FLAKES | 2158 | 137 | 2295 |
| FROSTED MINI WHEATS | 1539 | 35 | 1574 |
| GRAPE NUTS | 1289 | 0 | 1289 |
| KIX | 1196 | 0 | 1196 |
| LUCKY CHARMS | 1624 | 57 | 1681 |
| RAISIN BRAN | 1266 | 0 | 1266 |
| RICE KRISPIES | 1425 | 25 | 1450 |
| SHREDDED WHEAT | 1189 | 0 | 1189 |
| SMART START | 1134 | 0 | 1134 |
| SPECIAL K | 1305 | 86 | 1391 |
| Total | 21306 | 544 | 21850 |
Comments:
- Box sold the most for each cereal
- When the packaging is Cup, no category of cereal has enough observations
# Pairwise Pearson correlation matrix (rounded to 2 decimal places) for the
# numeric measures (units, price, volume).
# - week is converted to a factor and iri_key dropped so that neither ID-like
#   column is treated as a numeric variable in the correlation.
# - select(where(is.numeric)) replaces the superseded select_if().
correlation_matrix <- gm_joined_data %>%
  mutate(week = factor(week)) %>%
  select(-iri_key) %>%
  select(where(is.numeric)) %>%
  cor() %>%
  round(2)
| units | price | volume | |
|---|---|---|---|
| units | 1.00 | -0.19 | 0.02 |
| price | -0.19 | 1.00 | 0.54 |
| volume | 0.02 | 0.54 | 1.00 |
What’s going on with the four observations for cheerios and three observations for lucky charms that are listed as Regular flavor rather than Toasted?
why is there a negative correlation coefficient between price and units?
- visual of store (iri_key) isn't clear as a bar graph, so I'm not sure looking into it again here would really show us much
# Heat-map (mosaic-style) of observation counts by store (iri_key) and promo
gm_joined_data %>%
  group_by(iri_key, promo) %>%
  summarise(count = n()) %>%
  ggplot(aes(iri_key, promo)) +
  geom_tile(aes(fill = count))
## group_by: 2 grouping variables (iri_key, promo)
## summarise: now 2,656 rows and 3 columns, one group variable remaining (iri_key)
# Dodged bar chart: per-store (iri_key) counts split by promo status
gm_joined_data %>%
  ggplot(aes(x = iri_key, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- these plots confirm that visualizing iri_key isn't very telling
# Dodged bar chart: weekly counts split by promo status
gm_joined_data %>%
  ggplot(aes(x = factor(week), fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms there are more observations for non-promotion weeks than when there was an in store promotion in this data set
# Dodged bar chart: weekly counts split by advertisement category
gm_joined_data %>%
  ggplot(aes(x = factor(week), fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- when looking at advertisements over weeks we may want to split them out from when there is not an advertisement in order to evaluate performance of advertisement A against advertisement B
# Dodged bar chart of ad A vs ad B counts, restricted to weeks 1-26
gm_joined_data %>%
  filter(ad %in% c("A", "B"), week %in% 1:26) %>%
  ggplot(aes(x = factor(week), fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()
# Dodged bar chart of ad A vs ad B counts, restricted to weeks 27-52
gm_joined_data %>%
  filter(ad %in% c("A", "B"), week %in% 27:52) %>%
  ggplot(aes(x = factor(week), fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()
## filter: removed 20,790 rows (95%), 1,060 rows remaining
Comments:
- week 39 (typically around mid to late September) has a low count of medium sized advertisements
- there appears to be more medium advertisements (A) than small advertisements (B) per week, but that isn't consistent across all weeks
# Dodged bar chart: weekly counts split by brand
gm_joined_data %>%
  ggplot(aes(x = factor(week), fill = brand)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms across all weeks there are more Kelloggs brand than GM brand and more GM than Post brand observations in the data
# Dodged bar chart: weekly counts by flavor, weeks 1-26
gm_joined_data %>%
  filter(week %in% 1:26) %>%
  ggplot(aes(x = factor(week), fill = flavor)) +
  geom_bar(position = "dodge") +
  coord_flip()
# Dodged bar chart: weekly counts by flavor, weeks 27-52
gm_joined_data %>%
  filter(week %in% 27:52) %>%
  ggplot(aes(x = factor(week), fill = flavor)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms Regular followed by Toasted flavors typically have the most observations each week
- in week 52 toasted has more observations than regular
- in week 10 toasted and regular seem to have the same number of observations
# Dodged bar chart: weekly counts split by package type
gm_joined_data %>%
  ggplot(aes(x = factor(week), fill = package)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms box packaging sold the most each week
# Heat-map of observation counts by week and cereal, weeks 1-26
gm_joined_data %>%
  filter(week %in% 1:26) %>%
  group_by(week, cereal) %>%
  summarise(count = n()) %>%
  ggplot(aes(factor(week), cereal)) +
  geom_tile(aes(fill = count))
# Heat-map of observation counts by week and cereal, weeks 27-52
gm_joined_data %>%
  filter(week %in% 27:52) %>%
  group_by(week, cereal) %>%
  summarise(count = n()) %>%
  ggplot(aes(factor(week), cereal)) +
  geom_tile(aes(fill = count))
Comments:
- confirms Frosted Flakes and Froot Loops typically had the most observations
- in the first 26 weeks of the year, Cocoa Krispies typically had the least observations
- in the last 26 weeks it looks like Cocoa Puffs typically had the least sales
# Dodged bar chart: counts by advertisement category, split by promo status
gm_joined_data %>%
  ggplot(aes(x = ad, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- when you only look at the promotions for ad A compared with ad B there appears to be a similar pattern of promotions. It might be interesting to look at this further over weeks or stores
# Dodged bar chart of promo status, ads A and B only (NONE excluded)
gm_joined_data %>%
  filter(ad != "NONE") %>%
  ggplot(aes(x = ad, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()
# Heat-map of observation counts by ad (A/B only) and promo status
gm_joined_data %>%
  filter(ad != "NONE") %>%
  group_by(ad, promo) %>%
  summarise(count = n()) %>%
  ggplot(aes(ad, promo)) +
  geom_tile(aes(fill = count))
Comments:
- interesting to see that there are more observations of ads A and B when there is also an in store promotion being run than when there isn't
# Dodged bar chart: counts by brand, split by promo status
gm_joined_data %>%
  ggplot(aes(x = brand, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms there are more observations for each brand when there isn't an in store promotion
# Dodged bar chart: counts by flavor, split by promo status
gm_joined_data %>%
  ggplot(aes(x = flavor, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms there are more observations for each flavor when there isn't an in store promotion
# Dodged bar chart: counts by package type, split by promo status
gm_joined_data %>%
  ggplot(aes(x = package, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms there are more observations for each package type when there isn't an in store promotion
# Dodged bar chart: counts by cereal, split by promo status
gm_joined_data %>%
  ggplot(aes(x = cereal, fill = promo)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms there are more observations for each cereal when there isn't an in store promotion
# Observation counts per brand, split by ad category
ggplot(gm_joined_data, aes(x = brand, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms there are more observations for each brand when there isn't an advertisement
# Observation counts per brand for ads A and B only
ggplot(filter(gm_joined_data, ad != "NONE"), aes(x = brand, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- Post has more observations for advertisement B than A when there is an advertisement
- Kelloggs and GM have more observations for advertisement A when there is an ad
# Observation counts per flavor, split by ad category
ggplot(gm_joined_data, aes(x = flavor, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms there are more observations for each flavor when there isn't an advertisement
# Observation counts per flavor for ads A and B only
ggplot(filter(gm_joined_data, ad != "NONE"), aes(x = flavor, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- when there is an advertisement, there are more observations for ad A across all flavors
# Observation counts per package type, split by ad category
ggplot(gm_joined_data, aes(x = package, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms there are more observations for each package type when there isn't an advertisement
# Observation counts per cereal, split by ad category
ggplot(gm_joined_data, aes(x = cereal, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms there are more observations for each cereal when there isn't an advertisement
# Observation counts per cereal for ads A and B only
ggplot(filter(gm_joined_data, ad != "NONE"), aes(x = cereal, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- When there is an advertisement, all cereals have more observations for advertisement A except for grape nuts
- Grape nuts have more observations for ad B
# Observation counts per flavor, split by brand
# (header previously said "ad categories over cereal types" — stale copy-paste)
ggplot(gm_joined_data, aes(x = flavor, fill = brand)) +
  geom_bar(position = "dodge") +
  coord_flip()
- confirms all three brands sell regular flavored products and that kelloggs sells the most of that flavor
- confirms Post only sells regular flavored products, GM only sells toasted, regular, cocoa and cinnamon toast flavored products and Kelloggs only sells toasted, regular, cocoa, and cinnamon toast flavored products
- if we're trying to conduct a comparison of flavors by brands, we're going to run into some issues
# Observation counts per package type, split by brand
ggplot(gm_joined_data, aes(x = package, fill = brand)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms all three brands have more observations for boxed products
# Observation counts per cereal, split by brand
ggplot(gm_joined_data, aes(x = cereal, fill = brand)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms kelloggs' frosted flakes and froot loops cereals have the most observations
# Observation counts per flavor, split by package type
ggplot(gm_joined_data, aes(x = flavor, fill = package)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms for each flavor, boxed packaging has the most observations
# Observation counts per cereal, split by flavor
ggplot(gm_joined_data, aes(x = cereal, fill = flavor)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms something is wrong with Lucky Charms and Cheerio observations recorded as regular flavor (should be toasted)
# Observation counts per cereal, split by package type
ggplot(gm_joined_data, aes(x = cereal, fill = package)) +
  geom_bar(position = "dodge") +
  coord_flip()
Comments:
- confirms most observations for each cereal were packaged in a box
# set up colors shared by the box/bar plots below
fill_color <- "#4271AE"  # fill (steel blue)
line_color <- "#1F3552"  # outline (dark navy)
- negative correlation coefficient between units and price
# Units vs price: per-unit-count boxplots plus a linear-trend scatter.
# FIX: the original piped through group_by(units) before ggplot(); dplyr
# groups are ignored by ggplot2, so that call was a no-op and is removed.
# create box plot
units_price_box <- gm_joined_data %>%
  ggplot(mapping = aes(x = units, group = units, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color)
# create scatter plot with linear fit
units_price_scatter <- gm_joined_data %>%
  ggplot(mapping = aes(x = units, y = price)) +
  geom_point() +
  geom_smooth(method = "lm")
# output side by side (patchwork)
units_price_box + units_price_scatter
Comments:
- confirms there is some kind of negative relationship between units and price
- need to look at this over cereal types
- slight positive correlation coefficient between units and volume
# Units vs volume: per-unit-count boxplots plus a linear-trend scatter.
# FIX: removed the no-op group_by(units) before ggplot() (ggplot2 ignores
# dplyr grouping metadata).
# create box plot
units_vol_box <- gm_joined_data %>%
  ggplot(mapping = aes(x = units, group = units, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color)
# create scatter plot with linear fit
units_vol_scatter <- gm_joined_data %>%
  ggplot(mapping = aes(x = units, y = volume)) +
  geom_point() +
  geom_smooth(method = "lm")
# output side by side (patchwork)
units_vol_box + units_vol_scatter
Comments:
- confirms there is some kind of positive relationship between units and volume, but this looks odd
- average volume is not consistently increasing with units in a linear manner
- need to look at this over cereal types
- slight positive correlation coefficient between price and volume
# Price vs volume: per-volume boxplots plus a linear-trend scatter.
# FIX: removed the no-op group_by(volume) before ggplot() (ggplot2 ignores
# dplyr grouping metadata).
# create box plot
price_vol_box <- gm_joined_data %>%
  ggplot(mapping = aes(x = volume, group = volume, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color)
# create scatter plot with linear fit
price_vol_scatter <- gm_joined_data %>%
  ggplot(mapping = aes(x = volume, y = price)) +
  geom_point() +
  geom_smooth(method = "lm")
# output side by side (patchwork)
price_vol_box + price_vol_scatter
Comments:
- confirms there is some kind of positive relationship between price and volume
# Price distributions across each categorical dimension (boxplots)
price_promo <- ggplot(gm_total_sales, aes(x = promo, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color)
price_ad <- ggplot(gm_total_sales, aes(x = ad, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color)
price_brand <- ggplot(gm_total_sales, aes(x = brand, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color)
price_flavor <- ggplot(gm_total_sales, aes(x = flavor, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color)
price_package <- ggplot(gm_total_sales, aes(x = package, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color)
# the cereal panel is crowded and hard to read
price_cereal <- ggplot(gm_total_sales, aes(x = cereal, y = price)) +
  geom_boxplot(fill = fill_color, colour = line_color)
# assemble the panels with patchwork
(price_promo + price_package) / (price_flavor) / (price_brand + price_ad) / price_cereal
# Observation counts across each categorical dimension (bar charts)
unit_promo <- ggplot(gm_total_sales, aes(x = promo)) +
  geom_bar(fill = fill_color, colour = line_color)
unit_ad <- ggplot(gm_total_sales, aes(x = ad)) +
  geom_bar(fill = fill_color, colour = line_color)
unit_brand <- ggplot(gm_total_sales, aes(x = brand)) +
  geom_bar(fill = fill_color, colour = line_color)
unit_flavor <- ggplot(gm_total_sales, aes(x = flavor)) +
  geom_bar(fill = fill_color, colour = line_color)
unit_package <- ggplot(gm_total_sales, aes(x = package)) +
  geom_bar(fill = fill_color, colour = line_color)
# the cereal panel is crowded and hard to read
unit_cereal <- ggplot(gm_total_sales, aes(x = cereal)) +
  geom_bar(fill = fill_color, colour = line_color)
# assemble the panels with patchwork
(unit_promo + unit_package) / (unit_flavor) / (unit_brand + unit_ad) / unit_cereal
# Volume distributions across each categorical dimension (boxplots)
volume_promo <- ggplot(gm_total_sales, aes(x = promo, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color)
volume_ad <- ggplot(gm_total_sales, aes(x = ad, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color)
volume_brand <- ggplot(gm_total_sales, aes(x = brand, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color)
volume_flavor <- ggplot(gm_total_sales, aes(x = flavor, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color)
volume_package <- ggplot(gm_total_sales, aes(x = package, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color)
# the cereal panel is crowded and hard to read
volume_cereal <- ggplot(gm_total_sales, aes(x = cereal, y = volume)) +
  geom_boxplot(fill = fill_color, colour = line_color)
# assemble the panels with patchwork
(volume_promo + volume_package) / (volume_flavor) / (volume_brand + volume_ad) / volume_cereal
# Brand and Price
# Per-brand totals: median/mean price, unit count, revenue, and volume
brand_sales <- gm_total_sales %>%
  group_by(brand) %>%
  summarise(median_price = median(price),
            mean_price = mean(price),
            totalunits = n(),
            totalsales = sum(price),
            totalvolume = sum(volume))
# build bar graph with total sales per brand
# FIX: geom_col() already implies stat = "identity"; passing `stat` to
# geom_col() is not a valid argument, so it is dropped throughout.
brand_price <- brand_sales %>%
  ggplot(mapping = aes(x = brand, y = totalsales, fill = brand)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::dollar_format()) +
  coord_flip()
## brand and units
# build bar graph with total units per brand
brand_units <- brand_sales %>%
  ggplot(mapping = aes(x = brand, y = totalunits, fill = brand)) +
  geom_col(position = "dodge") +
  coord_flip()
## brand and volume
# build bar graph with total volume per brand
brand_volume <- brand_sales %>%
  ggplot(mapping = aes(x = brand, y = totalvolume, fill = brand)) +
  geom_col(position = "dodge") +
  coord_flip()
brand_price / brand_units / brand_volume
# Cereal and Price
# Totals per brand/cereal combination.
# FIX: totalsales was built with dollar(sum(price)), which returns a
# character vector and made the y axis of the sales plot categorical;
# keep it numeric and format the axis labels instead.
cereal_sales <- gm_total_sales %>%
  group_by(brand, cereal) %>%
  summarise(median_price = median(price),
            mean_price = mean(price),
            totalunits = n(),
            totalsales = sum(price),
            totalvolume = sum(volume))
# build bar graph with total sales per cereal
# (geom_col() already uses the identity stat; the stat argument was invalid)
cereal_sales %>%
  ggplot(mapping = aes(x = cereal, y = totalsales, fill = brand)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::dollar_format()) +
  coord_flip()
## cereal and units
cereal_sales %>%
  ggplot(mapping = aes(x = cereal, y = totalunits, fill = brand)) +
  geom_col(position = "dodge") +
  coord_flip()
## cereal and volume
cereal_sales %>%
  ggplot(mapping = aes(x = cereal, y = totalvolume, fill = brand)) +
  geom_col(position = "dodge") +
  coord_flip()
we could look at promotion performance across stores since the promotion is reliant upon the store
Which ad had better performance A or B
need to look at units per cereal and find total sales price/ total sales per cereal
Questions to be answered outside of the data
Are the four observations for cheerios and three observations for lucky charms that are listed as Regular flavor rather than Toasted data entry errors?
Is this data set representative of the marketplace for these three brands?
Can we get actual dates associated with these observations rather than just week numbers recorded in this data set?
Questions that can be answered from the data?
How well do promotions perform? (promotion performance)
How well do advertisements perform? (advertisement performance)
What about overall sales performance for each brand? (overall cereal performance)
What is the advertisement and in store promotion strategy for each brand? (competitive strategy performance)
Why is there a negative correlation coefficient between price and units?
# GM promotional unit sales relative to the other brands
ggplot(filter(gm_total_sales, promo == 1), aes(x = brand, fill = brand)) +
  geom_bar() +
  labs(y = "Units Sold")
General Mills sits between Kelloggs and Post in promotional units sold
# GM sales profile: units sold and revenue, with vs without a promotion
gm_cereal_Punits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 1) %>%
  ggplot(aes(x = cereal, fill = cereal)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Promotion") +
  labs(y = "Units Sold")
gm_cereal_Runits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 0) %>%
  ggplot(aes(x = cereal, fill = cereal)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("No Promotion") +
  labs(y = "Units Sold")
# revenue panels: geom_col() is ggplot2's shorthand for geom_bar(stat = "identity")
gm_cereal_Prev <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 1) %>%
  ggplot(aes(x = cereal, y = price, fill = cereal)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(y = "Revenue")
gm_cereal_Rrev <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 0) %>%
  ggplot(aes(x = cereal, y = price, fill = cereal)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(y = "Revenue")
(gm_cereal_Punits + gm_cereal_Runits) / (gm_cereal_Prev + gm_cereal_Rrev)
The overall sales profile for General Mills changes during a promotion. When promotions are being run, Lucky Charms moves from the third highest seller to the top seller
# Promotion unit sales across brands for regular, toasted, and cocoa flavors
reg_brands_units <- ggplot(
  filter(gm_total_sales, flavor == "REGULAR", promo == 1),
  aes(x = brand, fill = brand)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Regular on Promotion") +
  labs(y = "Units Sold")
toasted_brands_units <- ggplot(
  filter(gm_total_sales, flavor == "TOASTED", promo == 1),
  aes(x = brand, fill = brand)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Toasted on Promotion") +
  labs(y = "Units Sold")
cocoa_brands_units <- ggplot(
  filter(gm_total_sales, flavor == "COCOA", promo == 1),
  aes(x = brand, fill = brand)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Cocoa on Promotion") +
  labs(y = "Units Sold")
reg_brands_units / toasted_brands_units / cocoa_brands_units
General Mills trails units sold in regular flavored cereal to both Post and Kelloggs
# GM flavor profile: units sold and revenue, with vs without a promotion
gm_flavor_Punits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 1) %>%
  ggplot(aes(x = flavor, fill = flavor)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Promotion") +
  labs(y = "Units Sold")
gm_flavor_Runits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 0) %>%
  ggplot(aes(x = flavor, fill = flavor)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("No Promotion") +
  labs(y = "Units Sold")
# revenue panels: geom_col() is ggplot2's shorthand for geom_bar(stat = "identity")
gm_flavor_Prev <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 1) %>%
  ggplot(aes(x = flavor, y = price, fill = flavor)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(y = "Revenue")
gm_flavor_Rrev <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", promo == 0) %>%
  ggplot(aes(x = flavor, y = price, fill = flavor)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(y = "Revenue")
(gm_flavor_Punits + gm_flavor_Runits) / (gm_flavor_Prev + gm_flavor_Rrev)
Cocoa flavored cereal is the most impacted by promotions
# GM ad-driven unit sales relative to the other brands (ads A and B only)
ggplot(filter(gm_total_sales, ad %in% c("A", "B")), aes(x = brand, fill = ad)) +
  geom_bar(position = "dodge") +
  labs(y = "Units Sold")
The pattern of General Mills sitting between Kelloggs and Post holds consistent
# Ad impact on GM cereal units sold and revenue
gm_cereal_Aunits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", ad %in% c("A", "B")) %>%
  ggplot(aes(x = cereal, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip() +
  ggtitle("Ad") +
  labs(y = "Units Sold")
gm_cereal_Arevenue <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", ad %in% c("A", "B")) %>%
  group_by(ad, cereal) %>%
  summarise(rev = sum(price)) %>%
  ggplot(aes(x = cereal, y = rev, fill = ad)) +
  geom_col(position = "dodge", show.legend = FALSE) +
  coord_flip() +
  ggtitle("Ad") +
  labs(y = "Revenue") +
  scale_y_continuous(labels = scales::dollar_format())
gm_cereal_Aunits / gm_cereal_Arevenue
# Ad impact on GM flavor units sold and revenue
gm_flavor_Aunits <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", ad %in% c("A", "B")) %>%
  ggplot(aes(x = flavor, fill = ad)) +
  geom_bar(position = "dodge") +
  coord_flip() +
  ggtitle("Ad") +
  labs(y = "Units Sold")
gm_flavor_Arevenue <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS", ad %in% c("A", "B")) %>%
  group_by(ad, flavor) %>%
  summarise(rev = sum(price)) %>%
  ggplot(aes(x = flavor, y = rev, fill = ad)) +
  geom_col(position = "dodge", show.legend = FALSE) +
  coord_flip() +
  ggtitle("Ad") +
  labs(y = "Revenue") +
  scale_y_continuous(labels = scales::dollar_format())
gm_flavor_Aunits / gm_flavor_Arevenue
# Cereal profile (General Mills): total units and revenue
gm_cereal_units <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS") %>%
  ggplot(mapping = aes(x = cereal, fill = cereal)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Total Sales") +
  labs(y = "Units Sold")
# FIX: stat = "Identity" (capital I) only worked via ggplot2's internal
# name mangling; use geom_col(), the documented identity-stat bar geom.
gm_cereal_revenue <- gm_total_sales %>%
  filter(brand == "GENERAL MILLS") %>%
  ggplot(mapping = aes(x = cereal, y = price, fill = cereal)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ggtitle("") +
  labs(y = "Revenue")
gm_cereal_units / gm_cereal_revenue
# Cereal profile (Kelloggs): total units and revenue
kg_cereal_units <- gm_total_sales %>%
  filter(brand == "KELLOGGS") %>%
  ggplot(mapping = aes(x = cereal, fill = cereal)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Total Sales") +
  labs(y = "Units Sold")
# FIX: replaced geom_bar(stat = "Identity") — non-standard capitalization —
# with geom_col(), the documented identity-stat bar geom.
kg_cereal_revenue <- gm_total_sales %>%
  filter(brand == "KELLOGGS") %>%
  ggplot(mapping = aes(x = cereal, y = price, fill = cereal)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ggtitle("") +
  labs(y = "Revenue")
kg_cereal_units / kg_cereal_revenue
# Cereal profile (Post): total units and revenue
pt_cereal_units <- gm_total_sales %>%
  filter(brand == "POST") %>%
  ggplot(mapping = aes(x = cereal, fill = cereal)) +
  geom_bar(show.legend = FALSE) +
  coord_flip() +
  ggtitle("Total Sales") +
  labs(y = "Units Sold")
# FIX: replaced geom_bar(stat = "Identity") — non-standard capitalization —
# with geom_col(), the documented identity-stat bar geom.
pt_cereal_revenue <- gm_total_sales %>%
  filter(brand == "POST") %>%
  ggplot(mapping = aes(x = cereal, y = price, fill = cereal)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ggtitle("") +
  labs(y = "Revenue")
pt_cereal_units / pt_cereal_revenue
# Count promo weeks per store, then roll the counts up to brand level
promotions <- gm_joined_data %>%
  filter(promo == 1) %>%
  group_by(brand, iri_key) %>%
  summarise(number_of_promo_weeks_per_store = n()) %>%
  group_by(brand) %>%
  summarise(number_of_stores_with_promos = n(),
            total_promo_weeks_across_stores = sum(number_of_promo_weeks_per_store))
# how many stores ran at least one promotion, per brand
promo_stores <- ggplot(promotions,
                       aes(x = brand, y = number_of_stores_with_promos, fill = brand)) +
  geom_col(position = "dodge", show.legend = FALSE) +
  labs(title = "Number of Stores with Promotions", y = "Count of Stores", x = "") +
  coord_flip()
# how many store-weeks of promotion were run, per brand
promo_weeks <- ggplot(promotions,
                      aes(x = brand, y = total_promo_weeks_across_stores, fill = brand)) +
  geom_col(position = "dodge", show.legend = FALSE) +
  labs(title = "Number of weeks with Promotions", subtitle = "total promo weeks across all stores", y = "Count of Promo Weeks", x = "") +
  coord_flip()
promo_stores / promo_weeks
# Custom theme: classic base with a minimal, left-aligned bottom legend
clean_theme <- theme_classic() +
  theme(legend.position = "bottom",           # legend below the plot
        legend.direction = "horizontal",      # legend entries in a row
        legend.justification = "left",        # flush with the left edge
        legend.title = element_blank(),       # no legend title
        axis.line.y = element_blank(),        # drop the y-axis line
        axis.ticks.x = element_blank(),       # drop x-axis ticks
        axis.ticks.y = element_blank(),       # drop y-axis ticks
        plot.title = element_text(face = "bold", size = 15))  # bold, larger title
- set up data
# Build brand-by-week promotion metrics for TOASTED products, then join them
# into one analysis table.
# NOTE(review): columns named "average_*" are actually medians — confirm the
# naming is intentional before reporting.
# NOTE(review): prices come from gm_total_sales but units and store counts
# come from gm_joined_data — verify this mix of sources is deliberate.
# median weekly promo price per brand
toasted_promo_price_weekly <- gm_total_sales %>%
filter(promo == 1, flavor == "TOASTED") %>%
group_by(brand, week, flavor) %>%
summarise(average_promo_price = median(price)) %>%
select(-flavor)
# median weekly non-promo price per brand
toasted_average_non_promo_weekly_price<- gm_total_sales %>%
filter(promo == 0, flavor == "TOASTED") %>%
group_by(brand, week, flavor) %>%
summarise(average_non_promo_price = median(price))%>%
select(-flavor)
# median weekly units sold on promotion per brand
toasted_average_promo_weekly_units <- gm_joined_data %>%
filter(promo == 1, flavor == "TOASTED") %>%
group_by(brand, week, flavor) %>%
summarise(average_promo_units = median(units))%>%
select(-flavor)
# median weekly units sold off promotion per brand
toasted_average_non_promo_weekly_units <- gm_joined_data %>%
filter(promo == 0, flavor == "TOASTED") %>%
group_by(brand, week, flavor) %>%
summarise(average_non_promo_units = median(units))%>%
select(-flavor)
# number of store observations with a promotion, per brand-week
toasted_average_promo_weekly_store_count <- gm_joined_data %>%
filter(promo == 1, flavor == "TOASTED") %>%
group_by(brand, week, flavor) %>%
summarise(promo_store_count = n())%>%
select(-flavor)
# number of store observations without a promotion, per brand-week
toasted_average_no_promo_weekly_store_count <-gm_joined_data %>%
filter(promo == 0, flavor == "TOASTED") %>%
group_by(brand, week, flavor) %>%
summarise(no_promo_store_count = n())%>%
select(-flavor)
# right joins keep every brand-week present in the newly-added table;
# brand-weeks missing from earlier tables get NA metrics
toasted_weekly_promo_analysis <- right_join(toasted_promo_price_weekly, toasted_average_non_promo_weekly_price, by = c("brand", "week"))
toasted_weekly_promo_analysis <- right_join(toasted_weekly_promo_analysis, toasted_average_promo_weekly_units, by = c("brand", "week"))
toasted_weekly_promo_analysis <- right_join(toasted_weekly_promo_analysis, toasted_average_non_promo_weekly_units, by = c("brand", "week"))
toasted_weekly_promo_analysis <- right_join(toasted_weekly_promo_analysis, toasted_average_promo_weekly_store_count, by = c("brand", "week"))
toasted_weekly_promo_analysis <- right_join(toasted_weekly_promo_analysis, toasted_average_no_promo_weekly_store_count, by = c("brand", "week"))
- Look at Average Weekly Promotion pricing for each brand for toasted products
# Weekly promotion prices by brand, toasted products (loess-smoothed)
toasted_promo_weekly_price <- ggplot(
  toasted_weekly_promo_analysis,
  aes(x = week, y = average_promo_price, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills toasted products are typically priced lower than \nKelloggs when there is a promotion", subtitle = "Average weekly price for toasted flavored products") +
  ylab("Average Promo Price") +
  xlab("Week") +
  clean_theme
toasted_promo_weekly_price
## `geom_smooth()` using formula 'y ~ x'
- Look at average units sold at promotion prices
# Units sold vs. average promo price by brand, toasted products.
# FIX: max_lim was computed but never used here (the upper x limit is the
# hard-coded 3.5 below), so the dead assignment is removed.
min_lim <- min(toasted_weekly_promo_analysis$average_promo_price[toasted_weekly_promo_analysis$brand=='KELLOGGS'])
toasted_most_competitive_promo_price <- toasted_weekly_promo_analysis %>%
  ggplot(mapping = aes(x = average_promo_price, y = average_promo_units, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "Most competitive promotion price is around $2.50 \nfor toasted flavored products") +
  ylab("Units Sold") +
  xlab("Average Price") +
  clean_theme +
  geom_vline(xintercept = 2.50, color = "red") +
  # x axis starts at the Kelloggs minimum and is capped at $3.50
  scale_x_continuous(breaks = c(0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3, 3.5),
                     labels = scales::dollar_format(),
                     limits = c(min_lim, 3.5))
toasted_most_competitive_promo_price
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
- Look at the number of stores running promotions for each brand
# Weekly count of stores running toasted-product promotions, by brand
toasted_store_count_promo <- ggplot(
  toasted_weekly_promo_analysis,
  aes(x = week, y = promo_store_count, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills should run promotions in more stores", subtitle = "Kelloggs consistently runs more in store promotions per week \nthan General Mills for toasted products") +
  ylab("Number of stores with promotions") +
  xlab("Week") +
  clean_theme
toasted_store_count_promo
## `geom_smooth()` using formula 'y ~ x'
- show non-promotion analysis in some cohesive way (below is not it)
# patchwork layout: / binds tighter than +, so the right column stacks two plots
toasted_promo_weekly_price + (toasted_most_competitive_promo_price / toasted_store_count_promo)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
- Look at average weekly pricing for each brand when there isn't a promotion
# Weekly non-promotion prices by brand, toasted products (loess-smoothed)
toasted_nonpromo_weekly_price <- ggplot(
  toasted_weekly_promo_analysis,
  aes(x = week, y = average_non_promo_price, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "Average price for toasted flavored products \nwhen there is no promotion") +
  ylab("Average Non-promo Price") +
  xlab("Week") +
  clean_theme
toasted_nonpromo_weekly_price
## `geom_smooth()` using formula 'y ~ x'
- Look at average units sold for non-promotion prices
# Units sold vs. average non-promo price by brand, toasted products;
# x axis clipped to the Kelloggs price range
min_lim <- min(toasted_weekly_promo_analysis$average_non_promo_price[toasted_weekly_promo_analysis$brand=='KELLOGGS'])
max_lim <- max(toasted_weekly_promo_analysis$average_non_promo_price[toasted_weekly_promo_analysis$brand=='KELLOGGS'])
toasted_most_competitive_nonpromo_price <- ggplot(
  toasted_weekly_promo_analysis,
  aes(x = average_non_promo_price, y = average_non_promo_units, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "Most competitive non-promotion price is around $3.59 \nfor toasted flavored products") +
  ylab("Units Sold") +
  xlab("Average Price") +
  clean_theme +
  geom_vline(xintercept = 3.59, color = "red") +
  scale_x_continuous(labels = scales::dollar_format(),
                     limits = c(min_lim, max_lim))
toasted_most_competitive_nonpromo_price
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
- Look at the number of stores selling toasted products each week for each brand
# Weekly count of stores selling toasted products (no promotion), by brand.
# FIX: subtitle read "in \nmore in stores than" — garbled user-facing text.
toasted_store_count_nonpromo <- toasted_weekly_promo_analysis %>%
  ggplot(mapping = aes(x = week, y = no_promo_store_count, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills should sell toasted products in more stores", subtitle = "Kelloggs typically sells their products in \nmore stores than General Mills throughout the year") +
  ylab("Number of stores with promotions") +
  xlab("Week") +
  clean_theme
toasted_store_count_nonpromo
## `geom_smooth()` using formula 'y ~ x'
- show non-promotion analysis in some cohesive way (below is not it)
# patchwork layout: / binds tighter than +, so the right column stacks two plots
toasted_nonpromo_weekly_price + (toasted_most_competitive_nonpromo_price / toasted_store_count_nonpromo)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
Looking at promotion pricing, units sold and number of stores with weekly in-store promotions
# Build brand-by-week promotion metrics for REGULAR products, then join them
# into one analysis table.
# NOTE(review): columns named "average_*" are actually medians — confirm the
# naming is intentional before reporting.
# median weekly promo price per brand
regular_promo_price_weekly <- gm_joined_data %>%
filter(promo == 1, flavor == "REGULAR") %>%
group_by(brand, week, flavor) %>%
summarise(average_promo_price = median(price)) %>%
select(-flavor)
# median weekly non-promo price per brand
regular_average_non_promo_weekly_price<- gm_joined_data %>%
filter(promo == 0, flavor == "REGULAR") %>%
group_by(brand, week, flavor) %>%
summarise(average_non_promo_price = median(price)) %>%
select(-flavor)
# median weekly units sold on promotion per brand
regular_average_promo_weekly_units <- gm_joined_data %>%
filter(promo == 1, flavor == "REGULAR") %>%
group_by(brand, week, flavor) %>%
summarise(average_promo_units = median(units)) %>%
select(-flavor)
# median weekly units sold off promotion per brand
regular_average_non_promo_weekly_units <- gm_joined_data %>%
filter(promo == 0, flavor == "REGULAR") %>%
group_by(brand, week, flavor) %>%
summarise(average_non_promo_units = median(units)) %>%
select(-flavor)
# number of store observations with a promotion, per brand-week
regular_average_promo_weekly_store_count <- gm_joined_data %>%
filter(promo == 1, flavor == "REGULAR") %>%
group_by(brand, week, flavor) %>%
summarise(promo_store_count = n()) %>%
select(-flavor)
# number of store observations without a promotion, per brand-week
regular_average_no_promo_weekly_store_count <-gm_joined_data %>%
filter(promo == 0, flavor == "REGULAR") %>%
group_by(brand, week, flavor) %>%
summarise(no_promo_store_count = n()) %>%
select(-flavor)
# NOTE(review): this chain mixes right_join and left_join, whereas the
# toasted-products section uses right_join throughout — the resulting row
# set differs depending on join direction; confirm which is intended.
regular_weekly_promo_analysis <- right_join(regular_promo_price_weekly, regular_average_non_promo_weekly_price, by = c("brand", "week"))
regular_weekly_promo_analysis <- right_join(regular_weekly_promo_analysis, regular_average_promo_weekly_units, by = c("brand", "week"))
regular_weekly_promo_analysis <- left_join(regular_weekly_promo_analysis, regular_average_non_promo_weekly_units, by = c("brand", "week"))
regular_weekly_promo_analysis <- right_join(regular_weekly_promo_analysis, regular_average_promo_weekly_store_count, by = c("brand", "week"))
regular_weekly_promo_analysis <- left_join(regular_weekly_promo_analysis, regular_average_no_promo_weekly_store_count, by = c("brand", "week"))
- Look at Average Weekly Promotion pricing for each brand
# Weekly promotion prices by brand, regular products (loess-smoothed)
regular_promo_weekly_price <- ggplot(
  regular_weekly_promo_analysis,
  aes(x = week, y = average_promo_price, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills regular products are typically priced lower than \nKelloggs when there is a promotion", subtitle = "Average weekly price for regular flavored products") +
  ylab("Average Promo Price") +
  xlab("Week") +
  clean_theme
regular_promo_weekly_price
## `geom_smooth()` using formula 'y ~ x'
- Look at average units sold at promotion prices
# Units sold vs. average promo price by brand, regular products
# (header previously said "toasted" — stale copy-paste).
# FIX: max_lim was computed but never used here (the upper x limit is the
# hard-coded 3.5 below), so the dead assignment is removed.
min_lim <- min(regular_weekly_promo_analysis$average_promo_price[regular_weekly_promo_analysis$brand=='KELLOGGS'])
regular_most_competitive_promo_price <- regular_weekly_promo_analysis %>%
  ggplot(mapping = aes(x = average_promo_price, y = average_promo_units, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "Most competitive promotion price is around $2.74 \nfor regular flavored products") +
  ylab("Units Sold") +
  xlab("Average Price") +
  clean_theme +
  geom_vline(xintercept = 2.74, color = "red") +
  # x axis starts at the Kelloggs minimum and is capped at $3.50
  scale_x_continuous(breaks = c(0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3, 3.5),
                     labels = scales::dollar_format(),
                     limits = c(min_lim, 3.5))
regular_most_competitive_promo_price
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 25 rows containing non-finite values (stat_smooth).
- Look at the number of stores running promotions for each brand
# Weekly count of stores running promotions, by brand (regular products)
regular_store_count_promo <- regular_weekly_promo_analysis %>%
  ggplot(mapping = aes(x = week, y = promo_store_count, group = brand, color = brand)) +
  geom_smooth(method = "loess", se = FALSE) +
  ggtitle(label = "General Mills should run promotions in more stores", subtitle = "Kelloggs consistently runs more in store promotions per week \nthan General Mills") +
  ylab("Number of stores with promotions") +
  xlab("Week") +
  clean_theme  # FIX: clean_theme was added twice; adding it once is sufficient
regular_store_count_promo
## `geom_smooth()` using formula 'y ~ x'
- show promotion analysis in some cohesive way (below is not it)
#regular_promo_weekly_price + regular_most_competitive_promo_price / regular_store_count_promo
- Look at average weekly pricing for each brand when there isn't a promotion
# Weekly non-promo prices by brand for regular products
# (loess-smoothed trend of the weekly non-promotion price per brand)
regular_nonpromo_weekly_price <- regular_weekly_promo_analysis %>%
ggplot(mapping= aes(x = week, y = average_non_promo_price, group = brand, color = brand)) +
geom_smooth(method = "loess", se = FALSE) +
ggtitle(label = "General Mills products are typically priced lower than \nKelloggs when there is no promotion", subtitle = "Average price for regular flavored products") +
ylab("Average Non-promo Price") +
xlab("Week") +
clean_theme
regular_nonpromo_weekly_price
## `geom_smooth()` using formula 'y ~ x'
- Look at average units sold for non-promotion prices
# Weekly NON-promo price vs. units sold, by brand, for REGULAR flavored
# products (original comment said "promo ... toasted"; the code below uses
# the average_non_promo_* columns of regular_weekly_promo_analysis).
# x-axis limits are anchored to the Kelloggs non-promo price range.
min_lim <- min(regular_weekly_promo_analysis$average_non_promo_price[regular_weekly_promo_analysis$brand=='KELLOGGS'])
max_lim <- max(regular_weekly_promo_analysis$average_non_promo_price[regular_weekly_promo_analysis$brand=='KELLOGGS'])
regular_most_competitive_nonpromo_price <- regular_weekly_promo_analysis %>%
ggplot(mapping = aes(x = average_non_promo_price, y = average_non_promo_units, group = brand, color = brand)) +
geom_smooth(method = "loess", se = FALSE) +
ggtitle(label = "Most competitive non-promotion price is around $4.17 \nfor regular flavored products") +
ylab("Units Sold") +
xlab("Average Price") +
clean_theme +
# Red reference line marks the visually identified most competitive price
geom_vline(xintercept = 4.17, color = "red") +
scale_x_continuous(labels=scales::dollar_format(),
limits = c(min_lim, max_lim))
regular_most_competitive_nonpromo_price
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 30 rows containing non-finite values (stat_smooth).
- Look at the number of stores selling regular products each week for each brand
# Weekly count of stores selling regular products WITHOUT a promotion, by brand.
# Fixed: the y-axis was mislabeled "Number of stores with promotions" (this
# plot shows no_promo_store_count), and clean_theme was applied twice.
regular_store_count_nonpromo <- regular_weekly_promo_analysis %>%
ggplot(mapping= aes(x = week, y = no_promo_store_count, group = brand, color = brand)) +
geom_smooth(method = "loess", se = FALSE) +
ggtitle(label = "General Mills should sell products in more stores", subtitle = "Kelloggs consistently sells their products in \nmore in stores than General Mills") +
ylab("Number of stores selling products") +
xlab("Week") +
clean_theme
regular_store_count_nonpromo
## `geom_smooth()` using formula 'y ~ x'
- show non-promotion analysis in some cohesive way (below is not it)
# Combine the three non-promotion charts with patchwork: price trend on the
# left; price-response stacked over store counts on the right
regular_nonpromo_weekly_price + regular_most_competitive_nonpromo_price / regular_store_count_nonpromo
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 30 rows containing non-finite values (stat_smooth).
## `geom_smooth()` using formula 'y ~ x'
# Count the distinct weeks in which each brand ran each ad type.
# distinct() collapses store-level rows so a week is counted once per
# (brand, ad) pair; the result stays grouped by brand after summarise.
# NOTE(review): ad_week_counts does not appear to be used later in this
# chunk -- confirm whether it is consumed elsewhere in the document.
ad_week_counts <- gm_joined_data %>%
filter(ad != "NONE") %>%
select(ad, brand, week) %>%
distinct() %>%
group_by(brand, ad) %>%
summarise(count_of_weeks = n())
# Per-store count of ad-weeks for each brand/ad type
store_ads <- gm_joined_data %>%
filter(ad != "NONE") %>%
group_by(brand, iri_key, ad) %>%
summarise(number_of_ads = n())
# Collapse to brand/ad level: how many stores ran each ad type, and the
# total number of ad-weeks summed across those stores
ads <- store_ads %>%
group_by(brand, ad) %>%
summarise(number_of_stores_with_ads = n(),
total_ad_weeks_across_stores = sum(number_of_ads))
# NOTE: store_ads is reassigned here from a data frame to a plot object
store_ads <- ads %>%
ggplot(mapping = aes(x = brand, y = number_of_stores_with_ads, fill = ad )) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Number of Stores with Ads", y = "Count of Stores", x = "") +
coord_flip()
# Fixed: this panel previously plotted number_of_stores_with_ads again even
# though its title/axis describe ad-weeks; plot total_ad_weeks_across_stores
week_ads <- ads %>%
ggplot(mapping = aes(x = brand, y = total_ad_weeks_across_stores, fill = ad )) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Number of weeks with Ads", subtitle = "total ad weeks across all stores", y = "Count of Ad Weeks", x = "") +
coord_flip()
store_ads / week_ads
Null: There is no difference in product sales of General Mills compared to Kelloggs and Post
Alternative: Product sales between General Mills and the competition differs
# Chi-squared test of independence between promotion status and brand
chisq.test(table(gm_total_sales$promo, gm_total_sales$brand))
##
## Pearson's Chi-squared test
##
## data: table(gm_total_sales$promo, gm_total_sales$brand)
## X-squared = 892.64, df = 2, p-value < 0.00000000000000022
Small p value so we reject the null that there is no difference in sales
# Make table of counts to calculate confidence interval
# (one row per promo/brand cell; row order is fixed by the grouping)
P_B_n <- gm_total_sales %>%
group_by(promo, brand) %>%
summarise(n = n())
## group_by: 2 grouping variables (promo, brand)
## summarise: now 6 rows and 3 columns, one group variable remaining (promo)
# Calculate confidence interval using multinomial
# alpha = 0.01 -> 99% simultaneous (Sison-Glaz) intervals for the cell shares
P_B_n_ci <- multinomialCI(t(P_B_n[,3]), 0.01)
# Create a table with proportions that is ggplot friendly
# NOTE(review): sum() around nrow() is redundant -- nrow() is already a scalar
P_B_tab <- gm_total_sales %>%
group_by(promo, brand) %>%
summarise(prop = round(n()/sum(nrow(gm_total_sales)), 3))
## group_by: 2 grouping variables (promo, brand)
## summarise: now 6 rows and 3 columns, one group variable remaining (promo)
# add the confidence interval to the table of proportions
# (ci_1 = lower bound, ci_u = upper; rows align because P_B_n and P_B_tab
# use the identical group_by ordering)
P_B_tab$ci_1 <- round(P_B_n_ci[,1], 3)
P_B_tab$ci_u <- round(P_B_n_ci[,2], 3)
htmlTable(P_B_tab)
| promo | brand | prop | ci_1 | ci_u | |
|---|---|---|---|---|---|
| 1 | 0 | GENERAL MILLS | 0.283 | 0.28 | 0.286 |
| 2 | 0 | KELLOGGS | 0.385 | 0.382 | 0.388 |
| 3 | 0 | POST | 0.06 | 0.057 | 0.063 |
| 4 | 1 | GENERAL MILLS | 0.085 | 0.082 | 0.088 |
| 5 | 1 | KELLOGGS | 0.161 | 0.158 | 0.165 |
| 6 | 1 | POST | 0.025 | 0.022 | 0.028 |
All Brands are statistically different from 0, so there is a difference across the board
# Graph of proportions with confidence intervals
# (dodged bars of observation share by brand/promo, 99% CI error bars)
P_B_tab %>%
ggplot(aes(x = brand, y = prop, fill = promo)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = round(prop, 2)), vjust = -.5, color = "black", # vjust moves labels above CI
position = position_dodge(0.9), size = 4) +
geom_errorbar(aes(ymin = ci_1, ymax = ci_u),
width = 0.4, position = position_dodge(0.9))
Kelloggs reliably sells the most units in both on and off promotion
library("PerformanceAnalytics")
# chart.Correlation draws its plot as a side effect and returns NULL,
# so correlation_chart holds NULL (hence the "## NULL" printed below).
# Factors are coerced to integer codes to get a numeric correlation.
correlation_chart <- gm_total_sales %>%
mutate(brand = as.integer(brand), promo = as.integer(promo)) %>%
select(brand, promo) %>%
chart.Correlation()
correlation_chart
## NULL
Not practically significant
Null: During a promotion all brand’s cereal performs similarly in units sold
Alternate: Product performance differs among the brands during a promotion
# Chi-squared test of independence between promotion status and cereal
chisq.test(table(gm_total_sales$promo, gm_total_sales$cereal))
##
## Pearson's Chi-squared test
##
## data: table(gm_total_sales$promo, gm_total_sales$cereal)
## X-squared = 3424.1, df = 14, p-value < 0.00000000000000022
# Make table of counts to calculate confidence interval (promo x cereal cells)
P_C_n <- gm_total_sales %>%
group_by(promo, cereal) %>%
summarise(n = n())
## group_by: 2 grouping variables (promo, cereal)
## summarise: now 30 rows and 3 columns, one group variable remaining (promo)
# Calculate confidence interval using multinomial
# (alpha = 0.01 -> 99% simultaneous Sison-Glaz intervals)
P_C_n_ci <- multinomialCI(t(P_C_n[,3]), 0.01)
# Create a table with proportions that is ggplot friendly
# NOTE(review): sum() around nrow() is redundant -- nrow() is a scalar
P_C_tab <- gm_total_sales %>%
group_by(promo, cereal) %>%
summarise(prop = round(n()/sum(nrow(gm_total_sales)), 3))
## group_by: 2 grouping variables (promo, cereal)
## summarise: now 30 rows and 3 columns, one group variable remaining (promo)
# add the confidence interval to the table of proportions
# (rows align via the identical group_by ordering of P_C_n and P_C_tab)
P_C_tab$ci_1 <- round(P_C_n_ci[,1], 3)
P_C_tab$ci_u <- round(P_C_n_ci[,2], 3)
htmlTable(P_C_tab)
| promo | cereal | prop | ci_1 | ci_u | |
|---|---|---|---|---|---|
| 1 | 0 | CHEERIOS | 0.091 | 0.089 | 0.093 |
| 2 | 0 | CINNAMON TST CR | 0.066 | 0.064 | 0.068 |
| 3 | 0 | COCOA KRISPIES | 0.015 | 0.013 | 0.017 |
| 4 | 0 | COCOA PUFFS | 0.028 | 0.026 | 0.03 |
| 5 | 0 | FROOT LOOPS | 0.048 | 0.046 | 0.05 |
| 6 | 0 | FROSTED FLAKES | 0.084 | 0.082 | 0.086 |
| 7 | 0 | FROSTED MINI WHEATS | 0.06 | 0.058 | 0.062 |
| 8 | 0 | GRAPE NUTS | 0.035 | 0.033 | 0.037 |
| 9 | 0 | KIX | 0.038 | 0.036 | 0.04 |
| 10 | 0 | LUCKY CHARMS | 0.06 | 0.058 | 0.062 |
| 11 | 0 | RAISIN BRAN | 0.052 | 0.051 | 0.054 |
| 12 | 0 | RICE KRISPIES | 0.056 | 0.054 | 0.058 |
| 13 | 0 | SHREDDED WHEAT | 0.025 | 0.023 | 0.027 |
| 14 | 0 | SMART START | 0.022 | 0.02 | 0.024 |
| 15 | 0 | SPECIAL K | 0.048 | 0.046 | 0.049 |
| 16 | 1 | CHEERIOS | 0.018 | 0.016 | 0.02 |
| 17 | 1 | CINNAMON TST CR | 0.02 | 0.018 | 0.022 |
| 18 | 1 | COCOA KRISPIES | 0.009 | 0.007 | 0.011 |
| 19 | 1 | COCOA PUFFS | 0.015 | 0.013 | 0.017 |
| 20 | 1 | FROOT LOOPS | 0.026 | 0.024 | 0.028 |
| 21 | 1 | FROSTED FLAKES | 0.032 | 0.03 | 0.034 |
| 22 | 1 | FROSTED MINI WHEATS | 0.021 | 0.019 | 0.023 |
| 23 | 1 | GRAPE NUTS | 0.014 | 0.012 | 0.015 |
| 24 | 1 | KIX | 0.011 | 0.009 | 0.013 |
| 25 | 1 | LUCKY CHARMS | 0.022 | 0.02 | 0.024 |
| 26 | 1 | RAISIN BRAN | 0.016 | 0.014 | 0.018 |
| 27 | 1 | RICE KRISPIES | 0.021 | 0.019 | 0.022 |
| 28 | 1 | SHREDDED WHEAT | 0.011 | 0.009 | 0.013 |
| 29 | 1 | SMART START | 0.016 | 0.014 | 0.018 |
| 30 | 1 | SPECIAL K | 0.021 | 0.019 | 0.022 |
All cereals are statistically different from 0, so there is a difference across the board
# Graph of proportions with confidence intervals
# (dodged bars of observation share by cereal/promo, 99% CI error bars)
P_C_tab %>%
ggplot(aes(x = cereal, y = prop, fill = promo)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
position = position_dodge(0.9), size = 4) +
geom_errorbar(aes(ymin = ci_1, ymax = ci_u),
width = 0.4, position = position_dodge(0.9))
All cereals reliably sell the most units when there is no promotion
library("PerformanceAnalytics")
# Correlation panel for cereal vs. promo; factors coerced to integer codes.
# chart.Correlation plots as a side effect and returns NULL.
correlation_chart <- gm_total_sales %>%
mutate(cereal = as.integer(cereal), promo = as.integer(promo)) %>%
select(cereal, promo) %>%
chart.Correlation()
Null: All General Mills cereals perform similarly during a promotion
Alternative: Different General Mills cereals perform differently during a promotion
# Chi-squared test: promotion status vs. cereal, General Mills products only
chisq.test(table(gm_only$promo, gm_only$cereal))
##
## Pearson's Chi-squared test
##
## data: table(gm_only$promo, gm_only$cereal)
## X-squared = 1289.5, df = 4, p-value < 0.00000000000000022
Due to the small p value we reject the null that GM cereals perform the same under a promotion
# Make table of counts to calculate confidence interval (GM-only promo x cereal)
GM_P_C_n <- gm_only %>%
group_by(promo, cereal) %>%
summarise(n = n())
## group_by: 2 grouping variables (promo, cereal)
## summarise: now 10 rows and 3 columns, one group variable remaining (promo)
# Calculate confidence interval using multinomial
# (alpha = 0.01 -> 99% simultaneous Sison-Glaz intervals)
GM_P_C_n_ci <- multinomialCI(t(GM_P_C_n[,3]), 0.01)
# Create a table with proportions that is ggplot friendly
# NOTE(review): sum() around nrow() is redundant -- nrow() is a scalar
GM_P_C_tab <- gm_only %>%
group_by(promo, cereal) %>%
summarise(prop = round(n()/sum(nrow(gm_only)), 3))
## group_by: 2 grouping variables (promo, cereal)
## summarise: now 10 rows and 3 columns, one group variable remaining (promo)
# add the confidence interval to the table of proportions
# (rows align via the identical group_by ordering)
GM_P_C_tab$ci_1 <- round(GM_P_C_n_ci[,1], 3)
GM_P_C_tab$ci_u <- round(GM_P_C_n_ci[,2], 3)
htmlTable(GM_P_C_tab)
| promo | cereal | prop | ci_1 | ci_u | |
|---|---|---|---|---|---|
| 1 | 0 | CHEERIOS | 0.248 | 0.243 | 0.252 |
| 2 | 0 | CINNAMON TST CR | 0.179 | 0.174 | 0.183 |
| 3 | 0 | COCOA PUFFS | 0.077 | 0.072 | 0.081 |
| 4 | 0 | KIX | 0.102 | 0.098 | 0.107 |
| 5 | 0 | LUCKY CHARMS | 0.163 | 0.159 | 0.167 |
| 6 | 1 | CHEERIOS | 0.048 | 0.044 | 0.053 |
| 7 | 1 | CINNAMON TST CR | 0.053 | 0.049 | 0.058 |
| 8 | 1 | COCOA PUFFS | 0.041 | 0.037 | 0.046 |
| 9 | 1 | KIX | 0.029 | 0.025 | 0.034 |
| 10 | 1 | LUCKY CHARMS | 0.059 | 0.055 | 0.064 |
# Graph of proportions with confidence intervals
# (GM-only: dodged bars of observation share by cereal/promo with 99% CIs)
GM_P_C_tab %>%
ggplot(aes(x = cereal, y = prop, fill = promo)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
position = position_dodge(0.9), size = 4) +
geom_errorbar(aes(ymin = ci_1, ymax = ci_u),
width = 0.4, position = position_dodge(0.9))
library("PerformanceAnalytics")
# Correlation panel for cereal vs. promo (GM only); factors coerced to
# integer codes. chart.Correlation plots as a side effect and returns NULL,
# which is why printing correlation_chart shows "## NULL".
correlation_chart <- gm_only %>%
mutate(cereal = as.integer(cereal), promo = as.integer(promo)) %>%
select(cereal, promo) %>%
chart.Correlation()
correlation_chart
## NULL
Null: During a promotion all brand’s flavors perform similarly in units sold
Alternate: Product performance differs among the flavors during a promotion
# Chi-squared test of independence between promotion status and flavor
chisq.test(table(gm_total_sales$promo, gm_total_sales$flavor))
##
## Pearson's Chi-squared test
##
## data: table(gm_total_sales$promo, gm_total_sales$flavor)
## X-squared = 1199.8, df = 4, p-value < 0.00000000000000022
# Make table of counts to calculate confidence interval (promo x flavor cells)
P_D_n <- gm_total_sales %>%
group_by(promo, flavor) %>%
summarise(n = n())
## group_by: 2 grouping variables (promo, flavor)
## summarise: now 10 rows and 3 columns, one group variable remaining (promo)
# Calculate confidence interval using multinomial
# (alpha = 0.01 -> 99% simultaneous Sison-Glaz intervals)
P_D_n_ci <- multinomialCI(t(P_D_n[,3]), 0.01)
# Create a table with proportions that is ggplot friendly
# NOTE(review): sum() around nrow() is redundant -- nrow() is a scalar
P_D_tab <- gm_total_sales %>%
group_by(promo, flavor) %>%
summarise(prop = round(n()/sum(nrow(gm_total_sales)), 3))
## group_by: 2 grouping variables (promo, flavor)
## summarise: now 10 rows and 3 columns, one group variable remaining (promo)
# add the confidence interval to the table of proportions
# (rows align via the identical group_by ordering)
P_D_tab$ci_1 <- round(P_D_n_ci[,1], 3)
P_D_tab$ci_u <- round(P_D_n_ci[,2], 3)
htmlTable(P_D_tab)
| promo | flavor | prop | ci_1 | ci_u | |
|---|---|---|---|---|---|
| 1 | 0 | CINNAMON TOAST | 0.066 | 0.063 | 0.069 |
| 2 | 0 | COCOA | 0.043 | 0.04 | 0.046 |
| 3 | 0 | FRUIT | 0.048 | 0.045 | 0.051 |
| 4 | 0 | REGULAR | 0.294 | 0.291 | 0.297 |
| 5 | 0 | TOASTED | 0.277 | 0.274 | 0.28 |
| 6 | 1 | CINNAMON TOAST | 0.02 | 0.017 | 0.023 |
| 7 | 1 | COCOA | 0.025 | 0.022 | 0.027 |
| 8 | 1 | FRUIT | 0.026 | 0.023 | 0.029 |
| 9 | 1 | REGULAR | 0.105 | 0.102 | 0.108 |
| 10 | 1 | TOASTED | 0.096 | 0.094 | 0.099 |
All flavors are statistically different from 0, so there is a difference across the board
# Graph of proportions with confidence intervals
# (dodged bars of observation share by flavor/promo, 99% CI error bars)
P_D_tab %>%
ggplot(aes(x = flavor, y = prop, fill = promo)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
position = position_dodge(0.9), size = 4) +
geom_errorbar(aes(ymin = ci_1, ymax = ci_u),
width = 0.4, position = position_dodge(0.9))
All flavors reliably sell the most units when there is no promotion
library("PerformanceAnalytics")
# Correlation panel for flavor vs. promo; factors coerced to integer codes.
# chart.Correlation plots as a side effect and returns NULL (printed below).
correlation_chart <- gm_total_sales %>%
mutate(flavor = as.integer(flavor), promo = as.integer(promo)) %>%
select(flavor, promo) %>%
chart.Correlation()
correlation_chart
## NULL
Null: All flavors perform similarly during a promotion
Alternative: Different flavors perform differently during a promotion
# Chi-squared test: promotion status vs. flavor, General Mills products only
chisq.test(table(gm_only$promo, gm_only$flavor))
##
## Pearson's Chi-squared test
##
## data: table(gm_only$promo, gm_only$flavor)
## X-squared = 759.14, df = 3, p-value < 0.00000000000000022
# Make table of counts to calculate confidence interval (GM-only promo x flavor)
GM_P_F_n <- gm_only %>%
group_by(promo, flavor) %>%
summarise(n = n())
## group_by: 2 grouping variables (promo, flavor)
## summarise: now 8 rows and 3 columns, one group variable remaining (promo)
# Calculate confidence interval using multinomial
# (alpha = 0.01 -> 99% simultaneous Sison-Glaz intervals)
GM_P_F_n_ci <- multinomialCI(t(GM_P_F_n[,3]), 0.01)
# Create a table with proportions that is ggplot friendly
# NOTE(review): sum() around nrow() is redundant -- nrow() is a scalar
GM_P_F_tab <- gm_only %>%
group_by(promo, flavor) %>%
summarise(prop = round(n()/sum(nrow(gm_only)), 3))
## group_by: 2 grouping variables (promo, flavor)
## summarise: now 8 rows and 3 columns, one group variable remaining (promo)
# add the confidence interval to the table of proportions
# (rows align via the identical group_by ordering)
GM_P_F_tab$ci_1 <- round(GM_P_F_n_ci[,1], 3)
GM_P_F_tab$ci_u <- round(GM_P_F_n_ci[,2], 3)
htmlTable(GM_P_F_tab)
| promo | flavor | prop | ci_1 | ci_u | |
|---|---|---|---|---|---|
| 1 | 0 | CINNAMON TOAST | 0.179 | 0.174 | 0.184 |
| 2 | 0 | COCOA | 0.077 | 0.072 | 0.082 |
| 3 | 0 | REGULAR | 0.103 | 0.098 | 0.108 |
| 4 | 0 | TOASTED | 0.41 | 0.405 | 0.415 |
| 5 | 1 | CINNAMON TOAST | 0.053 | 0.049 | 0.058 |
| 6 | 1 | COCOA | 0.041 | 0.036 | 0.046 |
| 7 | 1 | REGULAR | 0.029 | 0.025 | 0.034 |
| 8 | 1 | TOASTED | 0.107 | 0.103 | 0.112 |
All flavors are statistically different from 0, so there is a difference across the board
# Graph of proportions with confidence intervals
# (GM-only: dodged bars of observation share by flavor/promo with 99% CIs)
GM_P_F_tab %>%
ggplot(aes(x = flavor, y = prop, fill = promo)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
position = position_dodge(0.9), size = 4) +
geom_errorbar(aes(ymin = ci_1, ymax = ci_u),
width = 0.4, position = position_dodge(0.9))
All flavors reliably sell the most units when there is no promotion
library("PerformanceAnalytics")
# Correlation panel for flavor vs. promo (GM only); factors coerced to
# integer codes. chart.Correlation plots and returns NULL (printed below).
correlation_chart <- gm_only %>%
mutate(flavor = as.integer(flavor), promo = as.integer(promo)) %>%
select(flavor, promo) %>%
chart.Correlation()
correlation_chart
## NULL
Null: There is no difference in ad performance between brands
Alternative: Ad performance varies between brands
# Chi-squared test of independence between ad type and brand
chisq.test(table(gm_total_sales$ad, gm_total_sales$brand))
##
## Pearson's Chi-squared test
##
## data: table(gm_total_sales$ad, gm_total_sales$brand)
## X-squared = 737.73, df = 4, p-value < 0.00000000000000022
Small p value so we reject the null that there is no difference in ad usage
# Make table of counts to calculate confidence interval (ad x brand cells)
A_B_n <- gm_total_sales %>%
group_by(ad, brand) %>%
summarise(n = n())
## group_by: 2 grouping variables (ad, brand)
## summarise: now 9 rows and 3 columns, one group variable remaining (ad)
# Calculate confidence interval using multinomial
# (alpha = 0.01 -> 99% simultaneous Sison-Glaz intervals)
A_B_n_ci <- multinomialCI(t(A_B_n[,3]), 0.01)
# Create a table with proportions that is ggplot friendly
# NOTE(review): sum() around nrow() is redundant -- nrow() is a scalar
A_B_tab <- gm_total_sales %>%
group_by(ad, brand) %>%
summarise(prop = round(n()/sum(nrow(gm_total_sales)), 3))
## group_by: 2 grouping variables (ad, brand)
## summarise: now 9 rows and 3 columns, one group variable remaining (ad)
# add the confidence interval to the table of proportions
# (rows align via the identical group_by ordering)
A_B_tab$ci_1 <- round(A_B_n_ci[,1], 3)
A_B_tab$ci_u <- round(A_B_n_ci[,2], 3)
htmlTable(A_B_tab)
| ad | brand | prop | ci_1 | ci_u | |
|---|---|---|---|---|---|
| 1 | A | GENERAL MILLS | 0.028 | 0.025 | 0.031 |
| 2 | A | KELLOGGS | 0.05 | 0.047 | 0.053 |
| 3 | A | POST | 0.006 | 0.003 | 0.009 |
| 4 | B | GENERAL MILLS | 0.017 | 0.014 | 0.02 |
| 5 | B | KELLOGGS | 0.04 | 0.037 | 0.043 |
| 6 | B | POST | 0.006 | 0.003 | 0.009 |
| 7 | NONE | GENERAL MILLS | 0.323 | 0.32 | 0.326 |
| 8 | NONE | KELLOGGS | 0.457 | 0.454 | 0.46 |
| 9 | NONE | POST | 0.073 | 0.07 | 0.076 |
All Brands are statistically different from 0, so there is a difference across the board
# Graph of proportions with confidence intervals
# (dodged bars of observation share by brand/ad type, 99% CI error bars)
A_B_tab %>%
ggplot(aes(x = brand, y = prop, fill = ad)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = round(prop, 2)), vjust = -.5, color = "black", # vjust moves labels above CI
position = position_dodge(0.9), size = 4) +
geom_errorbar(aes(ymin = ci_1, ymax = ci_u),
width = 0.4, position = position_dodge(0.9))
All Brands reliably sell the most units when there is no advertisement
library("PerformanceAnalytics")
# Correlation panel for brand vs. ad type; factors coerced to integer codes.
# chart.Correlation plots as a side effect and returns NULL (printed below).
correlation_chart <- gm_total_sales %>%
mutate(brand = as.integer(brand), ad = as.integer(ad)) %>%
select(brand, ad) %>%
chart.Correlation()
correlation_chart
## NULL
Not practically significant
Null: There is no difference in product performance with ad usage
Alternative: Product sales differ with ad usage
# Chi-squared test: ad type vs. cereal, General Mills products only
chisq.test(table(gm_only$ad, gm_only$cereal))
##
## Pearson's Chi-squared test
##
## data: table(gm_only$ad, gm_only$cereal)
## X-squared = 558.41, df = 8, p-value < 0.00000000000000022
# Make table of counts to calculate confidence interval (GM-only ad x cereal)
GM_A_C_n <- gm_only %>%
group_by(ad, cereal) %>%
summarise(n = n())
## group_by: 2 grouping variables (ad, cereal)
## summarise: now 15 rows and 3 columns, one group variable remaining (ad)
# Calculate confidence interval using multinomial
# (alpha = 0.01 -> 99% simultaneous Sison-Glaz intervals)
GM_A_C_n_ci <- multinomialCI(t(GM_A_C_n[,3]), 0.01)
# Create a table with proportions that is ggplot friendly
# NOTE(review): sum() around nrow() is redundant -- nrow() is a scalar
GM_A_C_tab <- gm_only %>%
group_by(ad, cereal) %>%
summarise(prop = round(n()/sum(nrow(gm_only)), 3))
## group_by: 2 grouping variables (ad, cereal)
## summarise: now 15 rows and 3 columns, one group variable remaining (ad)
# add the confidence interval to the table of proportions
# (rows align via the identical group_by ordering)
GM_A_C_tab$ci_1 <- round(GM_A_C_n_ci[,1], 3)
GM_A_C_tab$ci_u <- round(GM_A_C_n_ci[,2], 3)
htmlTable(GM_A_C_tab)
| ad | cereal | prop | ci_1 | ci_u | |
|---|---|---|---|---|---|
| 1 | A | CHEERIOS | 0.016 | 0.012 | 0.021 |
| 2 | A | CINNAMON TST CR | 0.018 | 0.014 | 0.023 |
| 3 | A | COCOA PUFFS | 0.012 | 0.007 | 0.017 |
| 4 | A | KIX | 0.01 | 0.005 | 0.015 |
| 5 | A | LUCKY CHARMS | 0.019 | 0.014 | 0.024 |
| 6 | B | CHEERIOS | 0.011 | 0.006 | 0.015 |
| 7 | B | CINNAMON TST CR | 0.008 | 0.004 | 0.013 |
| 8 | B | COCOA PUFFS | 0.009 | 0.004 | 0.014 |
| 9 | B | KIX | 0.008 | 0.003 | 0.013 |
| 10 | B | LUCKY CHARMS | 0.011 | 0.006 | 0.016 |
| 11 | NONE | CHEERIOS | 0.269 | 0.265 | 0.274 |
| 12 | NONE | CINNAMON TST CR | 0.206 | 0.201 | 0.21 |
| 13 | NONE | COCOA PUFFS | 0.097 | 0.092 | 0.102 |
| 14 | NONE | KIX | 0.113 | 0.109 | 0.118 |
| 15 | NONE | LUCKY CHARMS | 0.192 | 0.187 | 0.197 |
All products are statistically different from 0, so there is a difference across the board
# Graph of proportions with confidence intervals
# (GM-only: dodged bars of observation share by cereal/ad type with 99% CIs)
GM_A_C_tab %>%
ggplot(aes(x = cereal, y = prop, fill = ad)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
position = position_dodge(0.9), size = 4) +
geom_errorbar(aes(ymin = ci_1, ymax = ci_u),
width = 0.4, position = position_dodge(0.9))
library("PerformanceAnalytics")
# Correlation panel for cereal vs. ad type (GM only); factors coerced to
# integer codes. chart.Correlation plots and returns NULL (printed below).
correlation_chart <- gm_only %>%
mutate(cereal = as.integer(cereal), ad = as.integer(ad)) %>%
select(cereal, ad) %>%
chart.Correlation()
correlation_chart
## NULL
Null: All flavors perform similarly during ad usage
Alternative: Flavor performance changes with ad usage
# Chi-squared test: ad type vs. flavor, General Mills products only
chisq.test(table(gm_only$ad, gm_only$flavor))
##
## Pearson's Chi-squared test
##
## data: table(gm_only$ad, gm_only$flavor)
## X-squared = 394.12, df = 6, p-value < 0.00000000000000022
# Make table of counts to calculate confidence interval (GM-only ad x flavor)
GM_A_F_n <- gm_only %>%
group_by(ad, flavor) %>%
summarise(n = n())
## group_by: 2 grouping variables (ad, flavor)
## summarise: now 12 rows and 3 columns, one group variable remaining (ad)
# Calculate confidence interval using multinomial
# (alpha = 0.01 -> 99% simultaneous Sison-Glaz intervals)
GM_A_F_n_ci <- multinomialCI(t(GM_A_F_n[,3]), 0.01)
# Create a table with proportions that is ggplot friendly
# NOTE(review): sum() around nrow() is redundant -- nrow() is a scalar
GM_A_F_tab <- gm_only %>%
group_by(ad, flavor) %>%
summarise(prop = round(n()/sum(nrow(gm_only)), 3))
## group_by: 2 grouping variables (ad, flavor)
## summarise: now 12 rows and 3 columns, one group variable remaining (ad)
# add the confidence interval to the table of proportions
# (rows align via the identical group_by ordering)
GM_A_F_tab$ci_1 <- round(GM_A_F_n_ci[,1], 3)
GM_A_F_tab$ci_u <- round(GM_A_F_n_ci[,2], 3)
htmlTable(GM_A_F_tab)
| ad | flavor | prop | ci_1 | ci_u | |
|---|---|---|---|---|---|
| 1 | A | CINNAMON TOAST | 0.018 | 0.014 | 0.023 |
| 2 | A | COCOA | 0.012 | 0.007 | 0.017 |
| 3 | A | REGULAR | 0.01 | 0.005 | 0.015 |
| 4 | A | TOASTED | 0.035 | 0.03 | 0.04 |
| 5 | B | CINNAMON TOAST | 0.008 | 0.003 | 0.013 |
| 6 | B | COCOA | 0.009 | 0.004 | 0.014 |
| 7 | B | REGULAR | 0.008 | 0.003 | 0.013 |
| 8 | B | TOASTED | 0.022 | 0.017 | 0.026 |
| 9 | NONE | CINNAMON TOAST | 0.206 | 0.201 | 0.21 |
| 10 | NONE | COCOA | 0.097 | 0.092 | 0.102 |
| 11 | NONE | REGULAR | 0.114 | 0.109 | 0.119 |
| 12 | NONE | TOASTED | 0.46 | 0.456 | 0.465 |
All flavors are statistically different from 0, so there is a difference across the board
# Graph of proportions with confidence intervals
# (GM-only: dodged bars of observation share by flavor/ad type with 99% CIs)
GM_A_F_tab %>%
ggplot(aes(x = flavor, y = prop, fill = ad)) +
geom_bar(stat="identity", position = "dodge") +
geom_text(aes(label = round(prop, 2)), vjust = -1, color = "black", # vjust moves labels above CI
position = position_dodge(0.9), size = 4) +
geom_errorbar(aes(ymin = ci_1, ymax = ci_u),
width = 0.4, position = position_dodge(0.9))
library("PerformanceAnalytics")
# Correlation panel for flavor vs. ad type (GM only); factors coerced to
# integer codes. chart.Correlation plots and returns NULL (printed below).
correlation_chart <- gm_only %>%
mutate(flavor = as.integer(flavor), ad = as.integer(ad)) %>%
select(flavor, ad) %>%
chart.Correlation()
correlation_chart
## NULL
Null: There is no difference in the number of stores running weekly promotions for each brands
Alternative: Different brands have dissimilar promotion usage in stores
# Build six brand-week summary tables (one per metric) that are merged into
# weekly_promo_analysis below.
# NOTE(review): columns are named "average_*" but are computed with median()
# (presumably for robustness to outliers) -- confirm the naming is intended.
promo_price_weekly <- gm_total_sales %>%
filter(promo == 1) %>%
group_by(brand, week) %>%
summarise(average_promo_price = median(price))
## filter: removed 136,561 rows (73%), 50,889 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
# Median weekly price when NOT on promotion
average_non_promo_weekly_price<- gm_total_sales %>%
filter(promo == 0) %>%
group_by(brand, week) %>%
summarise(average_non_promo_price = median(price))
## filter: removed 50,889 rows (27%), 136,561 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
# Median weekly units sold on promotion
average_promo_weekly_units <- gm_joined_data %>%
filter(promo == 1) %>%
group_by(brand, week) %>%
summarise(average_promo_units = median(units))
## filter: removed 17,305 rows (79%), 4,545 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
# Median weekly units sold when NOT on promotion
average_non_promo_weekly_units <- gm_joined_data %>%
filter(promo == 0) %>%
group_by(brand, week) %>%
summarise(average_no_promo_units = median(units))
## filter: removed 4,545 rows (21%), 17,305 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
# Weekly count of store-observations with a promotion running
average_promo_weekly_store_count <- gm_joined_data %>%
filter(promo == 1) %>%
group_by(brand, week) %>%
summarise(promo_store_count = n())
## filter: removed 17,305 rows (79%), 4,545 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
# Weekly count of store-observations with NO promotion running
average_no_promo_weekly_store_count <-gm_joined_data %>%
filter(promo == 0) %>%
group_by(brand, week) %>%
summarise(no_promo_store_count = n())
## filter: removed 4,545 rows (21%), 17,305 rows remaining
## group_by: 2 grouping variables (brand, week)
## summarise: now 156 rows and 3 columns, one group variable remaining (brand)
# Assemble one brand-week table carrying promo/non-promo price, units, and
# store counts. Every piece shares the same 156 brand-week rows (per the
# tidylog output), so these right_joins behave like inner joins here.
weekly_promo_analysis <- right_join(promo_price_weekly, average_non_promo_weekly_price, by = c("brand", "week"))
## right_join: added one column (average_non_promo_price)
## > rows only in x ( 0)
## > rows only in y 0
## > matched rows 156
## > =====
## > rows total 156
weekly_promo_analysis <- right_join(weekly_promo_analysis, average_promo_weekly_units, by = c("brand", "week"))
## right_join: added one column (average_promo_units)
## > rows only in x ( 0)
## > rows only in y 0
## > matched rows 156
## > =====
## > rows total 156
weekly_promo_analysis <- right_join(weekly_promo_analysis, average_non_promo_weekly_units, by = c("brand", "week"))
## right_join: added one column (average_no_promo_units)
## > rows only in x ( 0)
## > rows only in y 0
## > matched rows 156
## > =====
## > rows total 156
weekly_promo_analysis <- right_join(weekly_promo_analysis, average_promo_weekly_store_count, by = c("brand", "week"))
## right_join: added one column (promo_store_count)
## > rows only in x ( 0)
## > rows only in y 0
## > matched rows 156
## > =====
## > rows total 156
weekly_promo_analysis <- right_join(weekly_promo_analysis, average_no_promo_weekly_store_count, by = c("brand", "week"))
## right_join: added one column (no_promo_store_count)
## > rows only in x ( 0)
## > rows only in y 0
## > matched rows 156
## > =====
## > rows total 156
# Set up data set for regression
# (vtable prints a variable-summary table of the merged brand-week data)
vtable(weekly_promo_analysis)
| Name | Class | Values |
|---|---|---|
| brand | factor | ‘GENERAL MILLS’ ‘KELLOGGS’ ‘POST’ |
| week | integer | Num: 1 to 52 |
| average_promo_price | numeric | Num: 2.12 to 4.08 |
| average_non_promo_price | numeric | Num: 3.45 to 4.29 |
| average_promo_units | numeric | Num: 1.5 to 18 |
| average_no_promo_units | numeric | Num: 2 to 9 |
| promo_store_count | integer | Num: 2 to 70 |
| no_promo_store_count | integer | Num: 22 to 237 |
# Linear regression with linear model
# promo_store_count ~ brand + week; GENERAL MILLS is the baseline level,
# so brandKELLOGGS / brandPOST are differences from General Mills.
store_promo_regression <- lm(promo_store_count ~ brand + week, data = weekly_promo_analysis)
# Review output
summary(store_promo_regression)
##
## Call:
## lm(formula = promo_store_count ~ brand + week, data = weekly_promo_analysis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.0580 -4.1579 -0.0225 3.9540 18.7438
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.92873 1.42473 18.199 <0.0000000000000002 ***
## brandKELLOGGS 27.55769 1.41097 19.531 <0.0000000000000002 ***
## brandPOST -14.00000 1.41097 -9.922 <0.0000000000000002 ***
## week -0.04956 0.03838 -1.291 0.199
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.195 on 152 degrees of freedom
## Multiple R-squared: 0.8555, Adjusted R-squared: 0.8527
## F-statistic: 300 on 3 and 152 DF, p-value: < 0.00000000000000022
par(mfrow = c(1, 1))
# Diagnostic: residuals plotted against the observed store counts
plot(weekly_promo_analysis$promo_store_count, store_promo_regression$residuals)
# Pull out the coefficients and confidence interval for table and graph
# (confint() uses its default confidence level here)
coe <- summary(store_promo_regression)$coefficients # get coefficients and related stats
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(store_promo_regression)[-1, ])) # find and bind CI, remove Intercept
# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval","low_CI","high_CI")
htmlTable(round(coe_CI[order(coe_CI$pval, decreasing = FALSE), ], 3))
| estimate | se | t | pval | low_CI | high_CI | |
|---|---|---|---|---|---|---|
| brandKELLOGGS | 27.558 | 1.411 | 19.531 | 0 | 24.77 | 30.345 |
| brandPOST | -14 | 1.411 | -9.922 | 0 | -16.788 | -11.212 |
| week | -0.05 | 0.038 | -1.291 | 0.199 | -0.125 | 0.026 |
# Cleveland dot plot of results: dot = point estimate, blue segments span the
# confidence interval, red line marks zero (no effect).
# Fixed: the second geom_segment referenced `coe_CI$pval` inside aes(), which
# triggered a "Use of coe_CI$pval is discouraged" warning -- use the bare
# column name `pval` like the first segment. Also removed the redundant first
# xlab("Coefficient"), which was silently overridden by the later xlab().
ggplot(coe_CI, aes(x = estimate, y = row.names(coe_CI))) +
geom_point(size = 3) +
xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
ylab("Variable") +
theme_bw()+
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))),
xend = coe_CI$high_CI, color = "Blue") +
geom_segment(aes(yend = reorder(row.names(coe_CI),desc(pval))),
xend = coe_CI$low_CI, color = "Blue") +
xlab("Coefficient with Confidence Interval")+
geom_vline(xintercept = 0, color = "red")
+ Kelloggs and Post reliably run promotions in a different number of stores than General Mills
- We reject the null hypothesis that General Mills has the same number of stores running promotions as Kelloggs and Post
- General Mills runs promotions in fewer stores than Kelloggs and more than Post at a 95% confidence level (the `confint()` default)
library("PerformanceAnalytics")
# Correlation panel for brand (as integer codes) vs. weekly promo store count.
# chart.Correlation plots as a side effect and returns NULL.
correlation_chart <- weekly_promo_analysis %>%
mutate(brand = as.integer(brand)) %>%
select(brand, promo_store_count) %>%
chart.Correlation()
Null: There is no difference in promotion usage between brands for toasted flavored products
Alternative: Different brands have dissimilar promotion usage for toasted flavored products
# Set up data set for regression
# (variable summary of the toasted-products brand-week table)
vtable(toasted_weekly_promo_analysis)
| Name | Class | Values |
|---|---|---|
| brand | factor | ‘GENERAL MILLS’ ‘KELLOGGS’ ‘POST’ |
| week | integer | Num: 1 to 52 |
| average_promo_price | numeric | Num: 2 to 3.99 |
| average_non_promo_price | numeric | Num: 3.59 to 4.39 |
| average_promo_units | numeric | Num: 3 to 23 |
| average_non_promo_units | numeric | Num: 4 to 13.5 |
| promo_store_count | integer | Num: 4 to 37 |
| no_promo_store_count | integer | Num: 32 to 81 |
# Linear regression with linear model
# promo_store_count ~ brand + week for toasted products; GENERAL MILLS is the
# baseline. Only a brandKELLOGGS term appears in the output, so Post evidently
# has no rows in the toasted data.
toasted_store_promo_regression <- lm(promo_store_count ~ brand + week, data = toasted_weekly_promo_analysis)
# Review output
summary(toasted_store_promo_regression)
##
## Call:
## lm(formula = promo_store_count ~ brand + week, data = toasted_weekly_promo_analysis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.232 -2.959 -0.912 2.352 18.423
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.82183 1.02203 10.589 < 0.0000000000000002 ***
## brandKELLOGGS 8.05769 0.90356 8.918 0.0000000000000216 ***
## week -0.02158 0.03010 -0.717 0.475
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.607 on 101 degrees of freedom
## Multiple R-squared: 0.4421, Adjusted R-squared: 0.4311
## F-statistic: 40.02 on 2 and 101 DF, p-value: 0.0000000000001587
# Reset the graphics device to a single plotting panel
par(mfrow = c(1, 1))
# Residuals plotted against the observed promo store counts
plot(
  toasted_weekly_promo_analysis$promo_store_count,
  toasted_store_promo_regression$residuals
)
# Pull out the coefficients and confidence interval for table and graph
coe <- summary(toasted_store_promo_regression)$coefficients
# Bind the 95% confidence bounds onto the coefficient stats; [-1, ] drops
# the intercept row from both pieces.
coe_CI <- as.data.frame(
  cbind(coe[-1, ], confint(toasted_store_promo_regression)[-1, ])
)
# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval", "low_CI", "high_CI")
# Render the results sorted by ascending p-value
htmlTable(round(coe_CI[order(coe_CI$pval, decreasing = FALSE), ], 3))
| estimate | se | t | pval | low_CI | high_CI | |
|---|---|---|---|---|---|---|
| brandKELLOGGS | 8.058 | 0.904 | 8.918 | 0 | 6.265 | 9.85 |
| week | -0.022 | 0.03 | -0.717 | 0.475 | -0.081 | 0.038 |
# Cleveland dot plot of results: coefficient estimates with 95% CI
# segments, rows ordered by descending p-value, red line marking zero.
# Fixes: columns are referenced bare inside aes() (removes the
# "Use of `coe_CI$pval` is discouraged" warning), one CI segment per row
# replaces the two estimate->bound segments, the y/yend ordering is made
# consistent, and the overridden duplicate xlab() call is dropped.
ggplot(coe_CI,
       aes(x = estimate,
           y = reorder(row.names(coe_CI), desc(pval)))) +
  geom_point(size = 3) +
  geom_segment(aes(x = low_CI, xend = high_CI,
                   yend = reorder(row.names(coe_CI), desc(pval))),
               color = "blue") +
  geom_vline(xintercept = 0, color = "red") +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient with Confidence Interval") +
  theme_bw()
## Warning: Use of `coe_CI$pval` is discouraged. Use `pval` instead.
Kelloggs reliably runs promotions for toasted products in a different number of stores than General Mills
library("PerformanceAnalytics")
correlation_chart <- toasted_weekly_promo_analysis %>%
mutate(brand = as.integer(brand)) %>%
select(brand, promo_store_count) %>%
chart.Correlation()
Null: There is no difference in promotion usage between brands for regular flavored products
Alternative: Different brands have dissimilar promotion usage for regular flavored products
# Set up data set for regression
# Summary table (vtable) of the regular-flavor weekly promo data modeled below
vtable(regular_weekly_promo_analysis)
| Name | Class | Values |
|---|---|---|
| brand | factor | ‘GENERAL MILLS’ ‘KELLOGGS’ ‘POST’ |
| week | integer | Num: 1 to 52 |
| average_promo_price | numeric | Num: 2 to 4.67 |
| average_non_promo_price | numeric | Num: 3.185 to 4.19 |
| average_promo_units | numeric | Num: 1 to 21 |
| average_non_promo_units | numeric | Num: 2 to 12 |
| promo_store_count | integer | Num: 1 to 34 |
| no_promo_store_count | integer | Num: 9 to 104 |
# Linear regression with linear model
# Model the weekly count of stores running promotions as a function of
# brand and week, for regular-flavor products only.
regular_store_promo_regression <- lm(
  promo_store_count ~ brand + week,
  data = regular_weekly_promo_analysis
)
# Review output
summary(regular_store_promo_regression)
##
## Call:
## lm(formula = promo_store_count ~ brand + week, data = regular_weekly_promo_analysis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.8851 -2.2786 -0.2662 1.8530 14.2890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.05065 0.79850 6.325 0.0000000027190049 ***
## brandKELLOGGS 15.48735 0.79231 19.547 < 0.0000000000000002 ***
## brandPOST 6.71812 0.79231 8.479 0.0000000000000192 ***
## week -0.04352 0.02146 -2.028 0.0443 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.02 on 151 degrees of freedom
## Multiple R-squared: 0.7202, Adjusted R-squared: 0.7146
## F-statistic: 129.5 on 3 and 151 DF, p-value: < 0.00000000000000022
# Reset the graphics device to a single plotting panel
par(mfrow = c(1, 1))
# Residuals plotted against the observed promo store counts
plot(
  regular_weekly_promo_analysis$promo_store_count,
  regular_store_promo_regression$residuals
)
# Pull out the coefficients and confidence interval for table and graph
coe <- summary(regular_store_promo_regression)$coefficients
# Bind the 95% confidence bounds onto the coefficient stats; [-1, ] drops
# the intercept row from both pieces.
coe_CI <- as.data.frame(
  cbind(coe[-1, ], confint(regular_store_promo_regression)[-1, ])
)
# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval", "low_CI", "high_CI")
# Render the results sorted by ascending p-value
htmlTable(round(coe_CI[order(coe_CI$pval, decreasing = FALSE), ], 3))
| estimate | se | t | pval | low_CI | high_CI | |
|---|---|---|---|---|---|---|
| brandKELLOGGS | 15.487 | 0.792 | 19.547 | 0 | 13.922 | 17.053 |
| brandPOST | 6.718 | 0.792 | 8.479 | 0 | 5.153 | 8.284 |
| week | -0.044 | 0.021 | -2.028 | 0.044 | -0.086 | -0.001 |
# Cleveland dot plot of results: coefficient estimates with 95% CI
# segments, rows ordered by descending p-value, red line marking zero.
# Fixes: columns are referenced bare inside aes() (removes the
# "Use of `coe_CI$pval` is discouraged" warning), one CI segment per row
# replaces the two estimate->bound segments, the y/yend ordering is made
# consistent, and the overridden duplicate xlab() call is dropped.
ggplot(coe_CI,
       aes(x = estimate,
           y = reorder(row.names(coe_CI), desc(pval)))) +
  geom_point(size = 3) +
  geom_segment(aes(x = low_CI, xend = high_CI,
                   yend = reorder(row.names(coe_CI), desc(pval))),
               color = "blue") +
  geom_vline(xintercept = 0, color = "red") +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient with Confidence Interval") +
  theme_bw()
## Warning: Use of `coe_CI$pval` is discouraged. Use `pval` instead.
Kelloggs reliably runs promotions for regular products in a different number of stores than General Mills
library("PerformanceAnalytics")
correlation_chart <- regular_weekly_promo_analysis %>%
mutate(brand = as.integer(brand)) %>%
select(brand, promo_store_count) %>%
chart.Correlation()
Null: There is no difference in weekly advertisement usage among brands
Alternative: Different brands have different weekly advertisement frequencies
# Per brand and ad size: number of distinct weeks in which that ad ran.
# Rows with ad == "NONE" are dropped first; distinct() collapses the
# duplicate ad/brand/week rows that come from multiple stores/UPCs.
ad_week_counts <- gm_joined_data %>%
filter(ad != "NONE") %>%
select(ad, brand, week) %>%
distinct() %>%
group_by(brand, ad) %>%
summarise(count_of_weeks = n())
## filter: removed 19,333 rows (88%), 2,517 rows remaining
## select: dropped 9 variables (UPC, iri_key, units, price, promo, …)
## distinct: removed 2,224 rows (88%), 293 rows remaining
## group_by: 2 grouping variables (brand, ad)
## summarise: now 6 rows and 3 columns, one group variable remaining (brand)
# Per brand, store (iri_key) and ad size: number of week-rows in which
# that ad ran at that store.
# NOTE(review): an identical pipeline appears again later in this script
# under the name `store_ads` — candidates for consolidation.
store_ad <- gm_joined_data %>%
filter(ad != "NONE") %>%
group_by(brand, iri_key, ad) %>%
summarise(number_of_ads = n())
## filter: removed 19,333 rows (88%), 2,517 rows remaining
## group_by: 3 grouping variables (brand, iri_key, ad)
## summarise: now 1,747 rows and 4 columns, 2 group variables remaining (brand, iri_key)
# Collapse store-level counts to brand/ad totals: how many stores ran
# each ad size, and total ad-weeks summed across those stores.
ads <- store_ad %>%
group_by(brand, ad) %>%
summarise(number_of_store_ads = n(),
total_number_of_ad_weeks = sum(number_of_ads)) %>%
# The summarise above already yields exactly these columns, so this
# select() is a no-op (tidylog echoes "select: no changes").
select(brand, ad, number_of_store_ads, total_number_of_ad_weeks)
## group_by: 2 grouping variables (brand, ad)
## summarise: now 6 rows and 4 columns, one group variable remaining (brand)
## select: no changes
# Attach the brand/ad totals back onto the row-level data, then keep only
# rows where an ad actually ran.
# NOTE(review): filtering before the join would avoid joining the 19,333
# rows that are immediately discarded; kept as-is to preserve the logs.
lm_data <- left_join(gm_joined_data, ads, by = c("brand", "ad")) %>%
filter(ad != "NONE")
## left_join: added 2 columns (number_of_store_ads, total_number_of_ad_weeks)
## > rows only in x 19,333
## > rows only in y ( 0)
## > matched rows 2,517
## > ========
## > rows total 21,850
## filter: removed 19,333 rows (88%), 2,517 rows remaining
# Count ads per week within each brand.
weekly_ads <- gm_joined_data %>%
filter(ad != "NONE") %>%
group_by(week, brand) %>%
# After the filter, ad can only be "A" or "B", so `advertisement` is 1
# for every remaining row (the tidylog echo confirms a single unique
# value); total_weekly_ads is therefore the per-group row count.
mutate(advertisement = case_when(
ad == "A" ~ 1,
ad == "B" ~ 1),
total_weekly_ads = sum(advertisement))
## filter: removed 19,333 rows (88%), 2,517 rows remaining
## group_by: 2 grouping variables (week, brand)
## mutate (grouped): new variable 'advertisement' (double) with one unique value and 0% NA
## new variable 'total_weekly_ads' (double) with 43 unique values and 0% NA
# Per-brand typical weekly ad volume.
# NOTE(review): despite the name, this column is the *median* of
# total_weekly_ads, not the mean.
weekly_ads <- weekly_ads %>%
group_by(brand) %>%
mutate(average_weekly_ads = median(total_weekly_ads))
## group_by: one grouping variable (brand)
## mutate (grouped): new variable 'average_weekly_ads' (double) with 3 unique values and 0% NA
# Set up data set for regression
# Summary table (vtable) of the weekly ads data used in the model below
vtable(weekly_ads)
| Name | Class | Values |
|---|---|---|
| UPC | character | |
| iri_key | factor | ‘200171’ ‘200197’ ‘200272’ ‘200297’ ‘200341’ and more |
| week | integer | Num: 1 to 52 |
| units | integer | Num: 1 to 28 |
| price | numeric | Num: 0.5 to 6.49 |
| promo | factor | ‘0’ ‘1’ |
| ad | factor | ‘A’ ‘B’ ‘NONE’ |
| brand | factor | ‘GENERAL MILLS’ ‘KELLOGGS’ ‘POST’ |
| flavor | factor | ‘CINNAMON TOAST’ ‘COCOA’ ‘FRUIT’ ‘REGULAR’ ‘TOASTED’ |
| volume | numeric | Num: 0.06 to 2 |
| package | factor | ‘BOX’ ‘CUP’ |
| cereal | factor | ‘CHEERIOS’ ‘CINNAMON TST CR’ ‘COCOA KRISPIES’ ‘COCOA PUFFS’ ‘FROOT LOOPS’ and more |
| advertisement | numeric | Num: 1 to 1 |
| total_weekly_ads | numeric | Num: 1 to 67 |
| average_weekly_ads | numeric | Num: 6 to 33 |
# Linear regression with linear model
# Model the total number of weekly ads as a function of brand and week.
mod <- lm(
  total_weekly_ads ~ brand + week,
  data = weekly_ads
)
# Review output
summary(mod)
##
## Call:
## lm(formula = total_weekly_ads ~ brand + week, data = weekly_ads)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.761 -6.697 -1.510 4.740 26.864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.28846 0.52674 48.01 <0.0000000000000002 ***
## brandKELLOGGS 17.03533 0.46947 36.29 <0.0000000000000002 ***
## brandPOST -12.17166 0.78156 -15.57 <0.0000000000000002 ***
## week -0.31254 0.01447 -21.59 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.39 on 2513 degrees of freedom
## Multiple R-squared: 0.5344, Adjusted R-squared: 0.5338
## F-statistic: 961.4 on 3 and 2513 DF, p-value: < 0.00000000000000022
Small p-value, so we reject the null hypothesis that there is no difference in weekly advertisement usage between brands
Kelloggs reliably runs advertisements for a different number of weeks
# Reset the graphics device to a single plotting panel
par(mfrow = c(1, 1))
# Residuals plotted against the observed weekly ad totals
plot(weekly_ads$total_weekly_ads, mod$residuals)
# Pull out the coefficients and confidence interval for table and graph
coe <- summary(mod)$coefficients
# Bind the 95% confidence bounds onto the coefficient stats; [-1, ] drops
# the intercept row from both pieces.
coe_CI <- as.data.frame(cbind(coe[-1, ], confint(mod)[-1, ]))
# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval", "low_CI", "high_CI")
# Render the results sorted by ascending p-value
htmlTable(round(coe_CI[order(coe_CI$pval, decreasing = FALSE), ], 3))
| estimate | se | t | pval | low_CI | high_CI | |
|---|---|---|---|---|---|---|
| brandKELLOGGS | 17.035 | 0.469 | 36.286 | 0 | 16.115 | 17.956 |
| week | -0.313 | 0.014 | -21.595 | 0 | -0.341 | -0.284 |
| brandPOST | -12.172 | 0.782 | -15.574 | 0 | -13.704 | -10.639 |
# Cleveland dot plot of results: coefficient estimates with 95% CI
# segments, rows ordered by descending p-value, red line marking zero.
# Fixes: columns are referenced bare inside aes() (removes the
# "Use of `coe_CI$pval` is discouraged" warning), one CI segment per row
# replaces the two estimate->bound segments, the y/yend ordering is made
# consistent, and the overridden duplicate xlab() call is dropped.
ggplot(coe_CI,
       aes(x = estimate,
           y = reorder(row.names(coe_CI), desc(pval)))) +
  geom_point(size = 3) +
  geom_segment(aes(x = low_CI, xend = high_CI,
                   yend = reorder(row.names(coe_CI), desc(pval))),
               color = "blue") +
  geom_vline(xintercept = 0, color = "red") +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient with Confidence Interval") +
  theme_bw()
## Warning: Use of `coe_CI$pval` is discouraged. Use `pval` instead.
Null: There is no difference among brands in the number of stores where advertisements are run
Alternative: Different brands run advertisements in different numbers of stores
# Per brand/week/ad combination: typical price, typical units sold, and
# the number of store-rows observed.
# NOTE(review): the "average_" columns are medians, not means.
weekly_ad_analysis <- gm_joined_data %>%
group_by(brand, week, ad) %>%
summarise(average_ad_price = median(price),
average_ad_units = median(units),
ad_store_count = n())
# Set up data set for regression
# Summary table (vtable) of the weekly ad data used in the model below
vtable(weekly_ad_analysis)
| Name | Class | Values |
|---|---|---|
| brand | factor | ‘GENERAL MILLS’ ‘KELLOGGS’ ‘POST’ |
| week | integer | Num: 1 to 52 |
| ad | factor | ‘A’ ‘B’ ‘NONE’ |
| average_ad_price | numeric | Num: 1.33 to 4.59 |
| average_ad_units | numeric | Num: 1 to 28 |
| ad_store_count | integer | Num: 1 to 262 |
# Linear regression with linear model
# Model the store count per brand/week/ad cell as a function of brand,
# week, and ad size.
store_ad_regression <- lm(
  ad_store_count ~ brand + week + ad,
  data = weekly_ad_analysis
)
# Review output
summary(store_ad_regression)
##
## Call:
## lm(formula = ad_store_count ~ brand + week + ad, data = weekly_ad_analysis)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.252 -24.615 -1.529 25.541 105.690
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.587902 4.136168 2.076 0.0384 *
## brandKELLOGGS 31.920584 3.512054 9.089 <0.0000000000000002 ***
## brandPOST -33.160200 3.619141 -9.162 <0.0000000000000002 ***
## week 0.004452 0.097599 0.046 0.9636
## adB -3.227771 3.613125 -0.893 0.3722
## adNONE 115.636799 3.548836 32.584 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.92 on 443 degrees of freedom
## Multiple R-squared: 0.7973, Adjusted R-squared: 0.795
## F-statistic: 348.6 on 5 and 443 DF, p-value: < 0.00000000000000022
Small p-value, so we reject the null hypothesis that there is no difference in the number of stores where ads are run
Kelloggs reliably runs advertisements in a different number of stores
# Reset the graphics device to a single plotting panel
par(mfrow = c(1, 1))
# Residuals plotted against the observed ad store counts
plot(weekly_ad_analysis$ad_store_count, store_ad_regression$residuals)
# Pull out the coefficients and confidence interval for table and graph
coe <- summary(store_ad_regression)$coefficients
# Bind the 95% confidence bounds onto the coefficient stats; [-1, ] drops
# the intercept row from both pieces.
coe_CI <- as.data.frame(
  cbind(coe[-1, ], confint(store_ad_regression)[-1, ])
)
# Rename results data frame
names(coe_CI) <- c("estimate", "se", "t", "pval", "low_CI", "high_CI")
# Render the results sorted by ascending p-value
htmlTable(round(coe_CI[order(coe_CI$pval, decreasing = FALSE), ], 3))
| estimate | se | t | pval | low_CI | high_CI | |
|---|---|---|---|---|---|---|
| adNONE | 115.637 | 3.549 | 32.584 | 0 | 108.662 | 122.611 |
| brandPOST | -33.16 | 3.619 | -9.162 | 0 | -40.273 | -26.047 |
| brandKELLOGGS | 31.921 | 3.512 | 9.089 | 0 | 25.018 | 38.823 |
| adB | -3.228 | 3.613 | -0.893 | 0.372 | -10.329 | 3.873 |
| week | 0.004 | 0.098 | 0.046 | 0.964 | -0.187 | 0.196 |
# Cleveland dot plot of results: coefficient estimates with 95% CI
# segments, rows ordered by descending p-value, red line marking zero.
# Fixes: columns are referenced bare inside aes() (removes the
# "Use of `coe_CI$pval` is discouraged" warning), one CI segment per row
# replaces the two estimate->bound segments, the y/yend ordering is made
# consistent, and the overridden duplicate xlab() call is dropped.
ggplot(coe_CI,
       aes(x = estimate,
           y = reorder(row.names(coe_CI), desc(pval)))) +
  geom_point(size = 3) +
  geom_segment(aes(x = low_CI, xend = high_CI,
                   yend = reorder(row.names(coe_CI), desc(pval))),
               color = "blue") +
  geom_vline(xintercept = 0, color = "red") +
  xlim(min(coe_CI$low_CI), max(coe_CI$high_CI)) +
  ylab("Variable") +
  xlab("Coefficient with Confidence Interval") +
  theme_bw()
## Warning: Use of `coe_CI$pval` is discouraged. Use `pval` instead.
# Per brand, store (iri_key) and ad size: number of week-rows in which
# that ad ran at that store.
# NOTE(review): identical to the earlier `store_ad` pipeline — candidates
# for consolidation.
store_ads <- gm_joined_data %>%
filter(ad != "NONE") %>%
group_by(brand, iri_key, ad) %>%
summarise(number_of_ads = n())
## filter: removed 19,333 rows (88%), 2,517 rows remaining
## group_by: 3 grouping variables (brand, iri_key, ad)
## summarise: now 1,747 rows and 4 columns, 2 group variables remaining (brand, iri_key)
# Collapse store-level counts to brand/ad totals: number of stores that
# ran each ad size, and total ad-weeks summed across those stores.
ads <- store_ads %>%
group_by(brand, ad) %>%
summarise(number_of_stores_with_ads = n(),
total_ad_weeks_across_stores = sum(number_of_ads))
## group_by: 2 grouping variables (brand, ad)
## summarise: now 6 rows and 4 columns, one group variable remaining (brand)
# Create graph for use in memo
# Horizontal dodged bar chart: number of stores running each ad size,
# by brand, brands ordered by store count.
store_ads_final <- ads %>%
  ggplot(mapping = aes(x = reorder(brand, number_of_stores_with_ads),
                       y = number_of_stores_with_ads, fill = ad)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Kelloggs runs the most ads in the most stores",
       subtitle = "Leading in both small and medium sized ads",
       y = "Stores", x = "",
       # Fix: legend title typo ("Advertisment" -> "Advertisement")
       fill = "Advertisement Size") +
  coord_flip() +
  theme_classic() +
  theme(axis.line.y = element_blank(),
        axis.ticks.y = element_blank()) +
  # Labels pair positionally with the ad factor levels ("A", "B"),
  # so A is rendered as "Small Ad" and B as "Medium Ad".
  scale_fill_manual(values = c("#CCCCFF", "#336699"),
                    labels = c("Small Ad", "Medium Ad"))
# Save the memo figure (ggsave default 7 x 5 in)
ggsave(filename = "store_ads_final.png", plot = store_ads_final)
## Saving 7 x 5 in image
store_ads_final
# Per brand and store: number of week-rows with a promotion running
# (promo == 1 rows only).
promotions <- gm_joined_data %>%
filter(promo == 1) %>%
group_by(brand, iri_key) %>%
summarise(number_of_promo_weeks_per_store = n())
## filter: removed 17,305 rows (79%), 4,545 rows remaining
## group_by: 2 grouping variables (brand, iri_key)
## summarise: now 2,270 rows and 3 columns, one group variable remaining (brand)
# Collapse to brand totals: number of stores that ran any promotion, and
# total promo weeks summed across those stores.
promotions <- promotions %>%
group_by(brand) %>%
summarise(number_of_stores_with_promos = n(),
total_promo_weeks_across_stores = sum(number_of_promo_weeks_per_store))
## group_by: one grouping variable (brand)
## summarise: now 3 rows and 3 columns, ungrouped
# Bar chart: number of stores running promos, by brand (legend hidden;
# brands ordered by store count).
promo_stores <- promotions %>%
ggplot(mapping = aes(x = reorder(brand, number_of_stores_with_promos), y = number_of_stores_with_promos, fill = brand )) +
geom_bar(stat = "identity", position = "dodge", show.legend = FALSE) +
labs(title = "Kelloggs runs the most in-store promotions",
subtitle = "Leading in number of stores running promos",
y = "Stores", x = "") +
coord_flip()+
# clean_theme is a custom theme object defined elsewhere in this script
clean_theme +
theme(axis.line.y = element_blank()) +
scale_fill_manual(values = c("darkblue", "#CCCCFF", "#336699"))
# Companion chart: total promo weeks summed across stores, by brand.
promo_weeks <- promotions %>%
ggplot(mapping = aes(x = reorder(brand, total_promo_weeks_across_stores), y = total_promo_weeks_across_stores, fill = brand )) +
geom_bar(stat = "identity", position = "dodge", show.legend = FALSE) +
labs(title = "",
subtitle = "Leading in number of weeks across stores running promos",
y = "Total Promo Weeks", x = "") +
coord_flip()+
clean_theme +
theme(axis.line.y = element_blank()) +
scale_fill_manual(values = c("darkblue", "#CCCCFF", "#336699"))
# patchwork's `/` operator stacks the two charts vertically (stores on top)
promo_final <- promo_stores / promo_weeks
# Save the combined memo figure (ggsave default 7 x 5 in)
ggsave(filename = "promo_final.png", plot = promo_final)
## Saving 7 x 5 in image
promo_final